Skip to content

Commit 00b5f51

Browse files
committed
HIVE-29651: Update ZookeeperExternalSessionsRegistryClient to handle multiple HiveServer2 instances submitting DAGs concurrently to available Tez External Sessions
1 parent b8dc73f commit 00b5f51

4 files changed

Lines changed: 247 additions & 18 deletions

File tree

ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezExternalSessionState.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.apache.hadoop.hive.conf.HiveConf;
2525
import org.apache.hadoop.hive.ql.exec.tez.monitoring.TezJobMonitor;
2626
import org.apache.hadoop.hive.ql.metadata.HiveException;
27+
import org.apache.hadoop.hive.ql.session.SessionState;
2728
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
2829
import org.apache.hadoop.security.Credentials;
2930
import org.apache.hadoop.yarn.api.records.ApplicationId;
@@ -133,6 +134,11 @@ public void close(boolean keepDagFilesDir) throws Exception {
133134
// We never close external sessions that don't have errors.
134135
try {
135136
if (externalAppId != null) {
137+
LOG.info("Returning external session with appID: {}", externalAppId);
138+
SessionState sessionState = SessionState.get();
139+
if (sessionState != null) {
140+
sessionState.setTezSession(null);
141+
}
136142
registry.returnSession(externalAppId);
137143
}
138144
} catch (Exception e) {

ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionPoolManager.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,20 @@ void returnSession(TezSession tezSessionState) {
353353
+ " belongs to the pool. Put it back in");
354354
defaultSessionPool.returnSession((TezSessionPoolSession)tezSessionState);
355355
}
356+
357+
if (useExternalSessions) {
358+
if (tezSessionState.getTezClient() != null
359+
&& tezSessionState.getTezClient().getAppMasterApplicationId() != null) {
360+
try {
361+
tezSessionState.close(false);
362+
} catch (Exception ex) {
363+
LOG.warn("Failed to return external Tez session {}", tezSessionState.getSessionId(), ex);
364+
}
365+
} else {
366+
LOG.warn("Not returning session '{}' as tez client or app id is null", tezSessionState.getSessionId());
367+
}
368+
}
369+
356370
// non default session nothing changes. The user can continue to use the existing
357371
// session in the SessionState
358372
} finally {

ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ZookeeperExternalSessionsRegistryClient.java

Lines changed: 108 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.HashSet;
2222
import java.util.Iterator;
2323
import java.util.Set;
24+
import java.util.concurrent.TimeUnit;
2425

2526
import com.google.common.annotations.VisibleForTesting;
2627
import org.apache.curator.framework.CuratorFramework;
@@ -31,9 +32,12 @@
3132
import org.apache.curator.framework.recipes.cache.CuratorCacheListener;
3233
import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent;
3334
import org.apache.curator.framework.recipes.cache.PathChildrenCacheListener;
35+
import org.apache.curator.framework.recipes.locks.InterProcessMutex;
3436
import org.apache.curator.retry.ExponentialBackoffRetry;
3537
import org.apache.hadoop.hive.conf.HiveConf;
3638
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
39+
import org.apache.zookeeper.CreateMode;
40+
import org.apache.zookeeper.KeeperException;
3741
import org.slf4j.Logger;
3842
import org.slf4j.LoggerFactory;
3943

@@ -48,8 +52,11 @@ public class ZookeeperExternalSessionsRegistryClient implements ExternalSessions
4852
private final Set<String> taken = new HashSet<>();
4953
private final Object lock = new Object();
5054
private final int maxAttempts;
51-
55+
private CuratorFramework client;
5256
private CuratorCache cache;
57+
private CuratorCache claimsCache;
58+
private InterProcessMutex globalQueue;
59+
private String claimsPath;
5360
private volatile boolean isInitialized;
5461

5562

@@ -66,15 +73,27 @@ private void init() {
6673
String zkServer = HiveConf.getVar(initConf, ConfVars.HIVE_ZOOKEEPER_QUORUM);
6774
String zkNamespace = HiveConf.getVar(initConf, ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_NAMESPACE);
6875
String effectivePath = normalizeZkPath(zkNamespace);
69-
CuratorFramework client = CuratorFrameworkFactory.newClient(zkServer, new ExponentialBackoffRetry(1000, 3));
76+
String queuePath = effectivePath + "-queue";
77+
this.claimsPath = effectivePath + "-claims";
78+
this.client = CuratorFrameworkFactory.newClient(zkServer, new ExponentialBackoffRetry(1000, 3));
79+
7080
synchronized (lock) {
7181
client.start();
82+
this.globalQueue = new InterProcessMutex(client, queuePath);
7283
this.cache = CuratorCache.build(client, effectivePath);
7384
CuratorCacheListener listener = CuratorCacheListener.builder()
7485
.forPathChildrenCache(effectivePath, client, new ExternalSessionsPathListener())
7586
.build();
7687
cache.listenable().addListener(listener);
7788
cache.start();
89+
90+
this.claimsCache = CuratorCache.build(client, claimsPath);
91+
CuratorCacheListener claimsListener = CuratorCacheListener.builder()
92+
.forPathChildrenCache(claimsPath, client, new ClaimsPathListener())
93+
.build();
94+
claimsCache.listenable().addListener(claimsListener);
95+
claimsCache.start();
96+
7897
cache.stream()
7998
.filter(childData -> childData.getPath() != null
8099
&& childData.getPath().startsWith(effectivePath + "/"))
@@ -91,22 +110,47 @@ static String normalizeZkPath(String zkNamespace) {
91110

92111
@Override
93112
public String getSession() throws Exception {
94-
synchronized (lock) {
95-
if (!isInitialized) {
96-
init();
97-
}
98-
long endTimeNs = System.nanoTime() + (1000000000L * maxAttempts);
99-
while (available.isEmpty() && ((endTimeNs - System.nanoTime()) > 0)) {
100-
lock.wait(1000L);
113+
if (!isInitialized) {
114+
synchronized (lock) {
115+
if (!isInitialized) {
116+
init();
117+
}
101118
}
102-
Iterator<String> iter = available.iterator();
103-
if (!iter.hasNext()) {
104-
throw new IOException("Cannot get a session after " + maxAttempts + " attempts");
119+
}
120+
121+
long endTimeNs = System.nanoTime() + (1000000000L * maxAttempts);
122+
long queueWaitTimeMs = Math.max(0, (endTimeNs - System.nanoTime()) / 1000000L);
123+
if (!globalQueue.acquire(queueWaitTimeMs, TimeUnit.MILLISECONDS)) {
124+
throw new IOException("Cannot get a session (timed out in queue) after " + maxAttempts + " seconds");
125+
}
126+
127+
try {
128+
synchronized (lock) {
129+
while (System.nanoTime() < endTimeNs) {
130+
Iterator<String> iter = available.iterator();
131+
132+
while (iter.hasNext()) {
133+
String appId = iter.next();
134+
try {
135+
String claimNodePath = claimsPath + "/" + appId;
136+
client.create().creatingParentsIfNeeded().withMode(CreateMode.EPHEMERAL).forPath(claimNodePath);
137+
iter.remove();
138+
taken.add(appId);
139+
return appId;
140+
} catch (KeeperException.NodeExistsException e) {
141+
iter.remove();
142+
}
143+
}
144+
long remainingTimeNs = endTimeNs - System.nanoTime();
145+
if (remainingTimeNs > 0) {
146+
long waitTimeMs = Math.min(1000L, remainingTimeNs / 1_000_000L);
147+
lock.wait(waitTimeMs);
148+
}
149+
}
150+
throw new IOException("Cannot get a session after waiting for " + maxAttempts + " seconds (timeout exhausted)");
105151
}
106-
String appId = iter.next();
107-
iter.remove();
108-
taken.add(appId);
109-
return appId;
152+
} finally {
153+
globalQueue.release();
110154
}
111155
}
112156

@@ -117,18 +161,33 @@ public void returnSession(String appId) {
117161
throw new IllegalStateException("Not initialized");
118162
}
119163
if (!taken.remove(appId)) {
120-
return; // Session has been removed from ZK.
164+
return; // Session has already been removed from ZK.
121165
}
166+
167+
try {
168+
client.delete().guaranteed().forPath(claimsPath + "/" + appId);
169+
} catch (KeeperException.NoNodeException e) {
170+
// If the claim Node has already been deleted, we can ignore it.
171+
} catch (Exception e) {
172+
LOG.warn("Failed to delete claim node for session {}", appId, e);
173+
}
174+
122175
available.add(appId);
123176
lock.notifyAll();
124177
}
125178
}
126179

127180
@Override
128181
public void close() {
182+
if (claimsCache != null) {
183+
claimsCache.close();
184+
}
129185
if (cache != null) {
130186
cache.close();
131187
}
188+
if (client != null) {
189+
client.close();
190+
}
132191
}
133192

134193
private final class ExternalSessionsPathListener implements PathChildrenCacheListener {
@@ -148,9 +207,10 @@ public void childEvent(final CuratorFramework client, final PathChildrenCacheEve
148207
switch (event.getType()) {
149208
case CHILD_UPDATED, CHILD_ADDED:
150209
if (available.contains(applicationId) || taken.contains(applicationId)) {
151-
return; // We do not expect updates to existing sessions; ignore them for now.
210+
return;
152211
}
153212
available.add(applicationId);
213+
lock.notifyAll();
154214
break;
155215
case CHILD_REMOVED:
156216
if (taken.remove(applicationId)) {
@@ -165,4 +225,34 @@ public void childEvent(final CuratorFramework client, final PathChildrenCacheEve
165225
}
166226
}
167227
}
228+
229+
private final class ClaimsPathListener implements PathChildrenCacheListener {
230+
@Override
231+
public void childEvent(CuratorFramework client, PathChildrenCacheEvent event) {
232+
ChildData childData = event.getData();
233+
if (childData == null) {
234+
return;
235+
}
236+
237+
String applicationId = getApplicationId(childData);
238+
synchronized (lock) {
239+
switch (event.getType()) {
240+
case CHILD_REMOVED:
241+
if (!taken.contains(applicationId)) {
242+
// if the claim node was released by this particular HS2 itself,
243+
// it will be added back to the available list & locks are notified as part of returnSession()
244+
available.add(applicationId);
245+
lock.notifyAll();
246+
}
247+
break;
248+
case CHILD_ADDED:
249+
// A Tez AM was claimed by another HS2, so remove the AM from the available list of this particular HS2
250+
available.remove(applicationId);
251+
break;
252+
default:
253+
break;
254+
}
255+
}
256+
}
257+
}
168258
}

ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestZookeeperExternalSessionsRegistryClient.java

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.hadoop.hive.ql.exec.tez;
2020

2121
import static org.junit.Assert.assertEquals;
22+
import static org.junit.Assert.assertNotEquals;
2223
import static org.junit.Assert.assertNotNull;
2324
import static org.junit.Assert.assertTrue;
2425

@@ -30,6 +31,11 @@
3031
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
3132
import org.junit.Test;
3233

34+
import java.util.concurrent.ExecutorService;
35+
import java.util.concurrent.Executors;
36+
import java.util.concurrent.Future;
37+
import java.util.concurrent.TimeUnit;
38+
3339
/**
3440
* Tests for {@link ZookeeperExternalSessionsRegistryClient}.
3541
*/
@@ -128,5 +134,118 @@ public void testReuseSameSession() throws Exception {
128134
}
129135
}
130136
}
137+
138+
/**
139+
* Tests that multiple registry clients (simulating multiple HS2 instances)
140+
* respect the global distributed lock (claims) and do not claim the same session simultaneously.
141+
*/
142+
@Test
143+
public void testSessionClaimsFromDifferentRegistryClients() throws Exception {
144+
CuratorFramework client = null;
145+
ZookeeperExternalSessionsRegistryClient registry1 = null;
146+
ZookeeperExternalSessionsRegistryClient registry2 = null;
147+
148+
try (TestingServer server = new TestingServer()) {
149+
String connectString = server.getConnectString();
150+
151+
HiveConf conf = new HiveConf();
152+
conf.setVar(ConfVars.HIVE_ZOOKEEPER_QUORUM, connectString);
153+
conf.setVar(ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_NAMESPACE, "/tez_ns_concurrent");
154+
conf.setIntVar(ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_WAIT_MAX_ATTEMPTS, 5);
155+
156+
String namespace = HiveConf.getVar(conf, ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_NAMESPACE);
157+
String effectivePath = ZookeeperExternalSessionsRegistryClient.normalizeZkPath(namespace);
158+
159+
CuratorFrameworkFactory.Builder builder = CuratorFrameworkFactory.builder();
160+
client = builder.connectString(connectString).retryPolicy(new RetryOneTime(1)).build();
161+
client.start();
162+
163+
client.create().creatingParentsIfNeeded().forPath(effectivePath + "/app_1");
164+
client.create().forPath(effectivePath + "/app_2");
165+
166+
registry1 = new ZookeeperExternalSessionsRegistryClient(conf);
167+
registry2 = new ZookeeperExternalSessionsRegistryClient(conf);
168+
169+
String sessionFromRegistry1 = registry1.getSession();
170+
String sessionFromRegistry2 = registry2.getSession();
171+
172+
assertNotNull("Registry 1 should have claimed a session", sessionFromRegistry1);
173+
assertNotNull("Registry 2 should have claimed a session", sessionFromRegistry2);
174+
175+
assertNotEquals("The two registries should claim different sessions!",
176+
sessionFromRegistry1, sessionFromRegistry2);
177+
178+
registry1.returnSession(sessionFromRegistry1);
179+
180+
String session3FromRegistry2 = registry2.getSession();
181+
assertEquals("Registry 2 should be able to claim the newly released session",
182+
sessionFromRegistry1, session3FromRegistry2);
183+
184+
registry2.returnSession(sessionFromRegistry2);
185+
registry2.returnSession(session3FromRegistry2);
186+
} finally {
187+
if (registry1 != null) {
188+
registry1.close();
189+
}
190+
if (registry2 != null) {
191+
registry2.close();
192+
}
193+
if (client != null) {
194+
client.close();
195+
}
196+
}
197+
}
198+
199+
/**
200+
* Tests that the InterProcessMutex enforces strict Global FIFO ordering.
201+
* Clients form a queue when no sessions are available, and are served in exact order.
202+
*/
203+
@Test
204+
public void testFIFOSessionClaimsFromDifferentRegistries() throws Exception {
205+
try (TestingServer server = new TestingServer()) {
206+
String connectString = server.getConnectString();
207+
208+
HiveConf conf = new HiveConf();
209+
conf.setVar(ConfVars.HIVE_ZOOKEEPER_QUORUM, connectString);
210+
conf.setVar(ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_NAMESPACE, "/tez_ns_fifo");
211+
conf.setIntVar(ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_WAIT_MAX_ATTEMPTS, 15);
212+
213+
String namespace = HiveConf.getVar(conf, ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_NAMESPACE);
214+
String effectivePath = ZookeeperExternalSessionsRegistryClient.normalizeZkPath(namespace);
215+
216+
CuratorFrameworkFactory.Builder builder = CuratorFrameworkFactory.builder();
217+
CuratorFramework client = builder.connectString(connectString).retryPolicy(new RetryOneTime(1)).build();
218+
client.start();
219+
220+
ExecutorService executor = Executors.newFixedThreadPool(3);
221+
ZookeeperExternalSessionsRegistryClient registry1 = new ZookeeperExternalSessionsRegistryClient(conf);
222+
ZookeeperExternalSessionsRegistryClient registry2 = new ZookeeperExternalSessionsRegistryClient(conf);
223+
ZookeeperExternalSessionsRegistryClient registry3 = new ZookeeperExternalSessionsRegistryClient(conf);
224+
try {
225+
Future<String> future1 = executor.submit(registry1::getSession);
226+
Thread.sleep(500);
227+
Future<String> future2 = executor.submit(registry2::getSession);
228+
Thread.sleep(500);
229+
Future<String> future3 = executor.submit(registry3::getSession);
230+
231+
client.create().creatingParentsIfNeeded().forPath(effectivePath + "/app_first");
232+
assertEquals("Registry 1 should get the first AM", "app_first", future1.get(5, TimeUnit.SECONDS));
233+
234+
client.create().forPath(effectivePath + "/app_second");
235+
String session2 = future2.get(5, TimeUnit.SECONDS);
236+
237+
assertEquals("Registry 2 should get the second AM", "app_second", session2);
238+
registry2.returnSession(session2);
239+
240+
assertEquals("Registry 3 should get the second AM", "app_second", future3.get(5, TimeUnit.SECONDS));
241+
} finally {
242+
registry1.close();
243+
registry2.close();
244+
registry3.close();
245+
client.close();
246+
executor.shutdownNow();
247+
}
248+
}
249+
}
131250
}
132251

0 commit comments

Comments
 (0)