|
19 | 19 | package org.apache.hadoop.hive.ql.exec.tez; |
20 | 20 |
|
21 | 21 | import java.io.IOException; |
| 22 | +import java.util.concurrent.TimeUnit; |
22 | 23 |
|
23 | 24 | import org.apache.hadoop.conf.Configuration; |
24 | 25 | import org.apache.hadoop.hive.conf.HiveConf; |
| 26 | +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; |
25 | 27 | import org.apache.hadoop.hive.ql.exec.tez.monitoring.TezJobMonitor; |
26 | 28 | import org.apache.hadoop.hive.ql.metadata.HiveException; |
27 | 29 | import org.apache.hadoop.hive.ql.session.SessionState; |
|
35 | 37 | import org.apache.tez.dag.api.client.DAGClient; |
36 | 38 | import org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolBlockingPB; |
37 | 39 | import org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolRPC; |
| 40 | +import org.apache.tez.dag.api.records.DAGProtos; |
38 | 41 | import org.apache.tez.serviceplugins.api.ServicePluginsDescriptor; |
39 | 42 |
|
40 | 43 | /** |
@@ -211,15 +214,47 @@ private void tryKillRunningDAGs(TezClient session) throws TezException { |
211 | 214 | if (proxy == null) { |
212 | 215 | throw new TezException("Error while trying to connect to AM for app ID " + externalAppId); |
213 | 216 | } |
| 217 | + long killTimeoutMs = TimeUnit.SECONDS.toMillis( |
| 218 | + HiveConf.getIntVar(conf, ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_WAIT_MAX_ATTEMPTS)); |
214 | 219 | try { |
215 | 220 | DAGClientAMProtocolRPC.GetAllDAGsResponseProto allDAGSResponse = |
216 | 221 | proxy.getAllDAGs(null, DAGClientAMProtocolRPC.GetAllDAGsRequestProto.newBuilder().build()); |
217 | 222 | for (String dagId : allDAGSResponse.getDagIdList()) { |
218 | 223 | LOG.info("External session: attempting to kill dagId {} on app ID {}", dagId, externalAppId); |
219 | 224 | proxy.tryKillDAG(null, DAGClientAMProtocolRPC.TryKillDAGRequestProto.newBuilder().setDagId(dagId).build()); |
| 225 | + waitForDagTerminal(proxy, dagId, killTimeoutMs); |
220 | 226 | } |
221 | 227 | } catch (Exception e) { |
222 | 228 | throw new TezException("Error while trying to kill existing DAG running on app ID " + externalAppId, e); |
223 | 229 | } |
224 | 230 | } |
| 231 | + |
| 232 | + private void waitForDagTerminal(DAGClientAMProtocolBlockingPB proxy, String dagId, long timeoutMs) |
| 233 | + throws Exception { |
| 234 | + long startTimeMs = System.currentTimeMillis(); |
| 235 | + long pollIntervalMs = conf.getTimeVar(ConfVars.TEZ_DAG_STATUS_CHECK_INTERVAL, TimeUnit.MILLISECONDS); |
| 236 | + while (System.currentTimeMillis() - startTimeMs < timeoutMs) { |
| 237 | + long remainingMs = timeoutMs - (System.currentTimeMillis() - startTimeMs); |
| 238 | + DAGClientAMProtocolRPC.GetDAGStatusResponseProto response = proxy.getDAGStatus(null, |
| 239 | + DAGClientAMProtocolRPC.GetDAGStatusRequestProto.newBuilder() |
| 240 | + .setDagId(dagId) |
| 241 | + .setTimeout(Math.min(pollIntervalMs, remainingMs)) |
| 242 | + .build()); |
| 243 | + if (response.hasDagStatus() && response.getDagStatus().hasState() |
| 244 | + && isTerminalDagState(response.getDagStatus().getState())) { |
| 245 | + LOG.info("External session: dagId {} on app ID {} reached terminal state {}", dagId, externalAppId, |
| 246 | + response.getDagStatus().getState()); |
| 247 | + return; |
| 248 | + } |
| 249 | + } |
| 250 | + throw new TezException("Timed out after " + timeoutMs + " ms waiting for orphan DAG " + dagId |
| 251 | + + " on app ID " + externalAppId + " to reach terminal state after kill"); |
| 252 | + } |
| 253 | + |
| 254 | + private static boolean isTerminalDagState(DAGProtos.DAGStatusStateProto state) { |
| 255 | + return switch (state) { |
| 256 | + case DAG_SUCCEEDED, DAG_KILLED, DAG_FAILED, DAG_ERROR -> true; |
| 257 | + default -> false; |
| 258 | + }; |
| 259 | + } |
225 | 260 | } |
0 commit comments