|
19 | 19 | package org.apache.hadoop.hive.ql.exec.tez; |
20 | 20 |
|
21 | 21 | import java.io.IOException; |
| 22 | +import java.util.concurrent.TimeUnit; |
22 | 23 |
|
| 24 | +import com.google.protobuf.ServiceException; |
23 | 25 | import org.apache.hadoop.conf.Configuration; |
24 | 26 | import org.apache.hadoop.hive.conf.HiveConf; |
| 27 | +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; |
25 | 28 | import org.apache.hadoop.hive.ql.exec.tez.monitoring.TezJobMonitor; |
26 | 29 | import org.apache.hadoop.hive.ql.metadata.HiveException; |
| 30 | +import org.apache.hadoop.hive.ql.session.SessionState; |
27 | 31 | import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; |
28 | 32 | import org.apache.hadoop.security.Credentials; |
29 | 33 | import org.apache.hadoop.yarn.api.records.ApplicationId; |
30 | 34 | import org.apache.tez.client.TezClient; |
| 35 | +import org.apache.tez.dag.api.DAG; |
31 | 36 | import org.apache.tez.dag.api.TezConfiguration; |
32 | 37 | import org.apache.tez.dag.api.TezException; |
| 38 | +import org.apache.tez.dag.api.client.DAGClient; |
| 39 | +import org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolBlockingPB; |
| 40 | +import org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolRPC; |
| 41 | +import org.apache.tez.dag.api.records.DAGProtos; |
33 | 42 | import org.apache.tez.serviceplugins.api.ServicePluginsDescriptor; |
34 | 43 |
|
35 | 44 | /** |
@@ -139,6 +148,11 @@ public void close(boolean keepDagFilesDir) throws Exception { |
139 | 148 | // We never close external sessions that don't have errors. |
140 | 149 | try { |
141 | 150 | if (externalAppId != null) { |
| 151 | + LOG.debug("Returning external session with appID: {}", externalAppId); |
| 152 | + SessionState sessionState = SessionState.get(); |
| 153 | + if (sessionState != null) { |
| 154 | + sessionState.setTezSession(null); |
| 155 | + } |
142 | 156 | registry.returnSession(externalAppId); |
143 | 157 | } |
144 | 158 | } catch (Exception e) { |
@@ -187,4 +201,73 @@ public boolean killQuery(String reason) throws HiveException { |
187 | 201 | killQuery.killQuery(queryId, reason, conf, false); |
188 | 202 | return true; |
189 | 203 | } |
| 204 | + |
| 205 | + @Override |
| 206 | + public DAGClient submitDAG(DAG dag) throws TezException, IOException { |
| 207 | + if (!registry.isClaimed(externalAppId)) { |
| 208 | + throw new TezException("Cannot submit DAG as the Tez Session no-longer owns the AM: " + externalAppId); |
| 209 | + } |
| 210 | + try { |
| 211 | + return getTezClient().submitDAG(dag); |
| 212 | + } catch (TezException e) { |
| 213 | + if (e.getMessage() == null || !e.getMessage().contains("App master already running a DAG")) { |
| 214 | + throw e; |
| 215 | + } |
| 216 | + tryKillRunningDAGs(getTezClient()); |
| 217 | + return getTezClient().submitDAG(dag); |
| 218 | + } |
| 219 | + } |
| 220 | + |
| 221 | + private void tryKillRunningDAGs(TezClient session) throws TezException { |
| 222 | + if (!registry.isClaimed(externalAppId)) { |
| 223 | + throw new TezException("Cannot kill running DAG as the Tez Session no-longer owns the AM: " + externalAppId); |
| 224 | + } |
| 225 | + LOG.info("External session has an AM which is already running a DAG on app ID {}", externalAppId); |
| 226 | + DAGClientAMProtocolBlockingPB proxy = session.sendAMHeartbeat(null); |
| 227 | + if (proxy == null) { |
| 228 | + throw new TezException("Error while trying to connect to AM for app ID " + externalAppId); |
| 229 | + } |
| 230 | + long killTimeoutMs = TimeUnit.SECONDS.toMillis( |
| 231 | + HiveConf.getIntVar(conf, ConfVars.HIVE_SERVER2_TEZ_EXTERNAL_SESSIONS_WAIT_MAX_ATTEMPTS)); |
| 232 | + try { |
| 233 | + DAGClientAMProtocolRPC.GetAllDAGsResponseProto allDAGSResponse = |
| 234 | + proxy.getAllDAGs(null, DAGClientAMProtocolRPC.GetAllDAGsRequestProto.newBuilder().build()); |
| 235 | + for (String dagId : allDAGSResponse.getDagIdList()) { |
| 236 | + LOG.info("External session: attempting to kill dagId {} on app ID {}", dagId, externalAppId); |
| 237 | + proxy.tryKillDAG(null, DAGClientAMProtocolRPC.TryKillDAGRequestProto.newBuilder().setDagId(dagId).build()); |
| 238 | + waitForDagTerminal(proxy, dagId, killTimeoutMs); |
| 239 | + } |
| 240 | + } catch (Exception e) { |
| 241 | + throw new TezException("Error while trying to kill existing DAG running on app ID " + externalAppId, e); |
| 242 | + } |
| 243 | + } |
| 244 | + |
| 245 | + private void waitForDagTerminal(DAGClientAMProtocolBlockingPB proxy, String dagId, long timeoutMs) |
| 246 | + throws TezException, ServiceException { |
| 247 | + long startTimeMs = System.currentTimeMillis(); |
| 248 | + long pollIntervalMs = conf.getTimeVar(ConfVars.TEZ_DAG_STATUS_CHECK_INTERVAL, TimeUnit.MILLISECONDS); |
| 249 | + while (System.currentTimeMillis() - startTimeMs < timeoutMs) { |
| 250 | + long remainingMs = timeoutMs - (System.currentTimeMillis() - startTimeMs); |
| 251 | + DAGClientAMProtocolRPC.GetDAGStatusResponseProto response = proxy.getDAGStatus(null, |
| 252 | + DAGClientAMProtocolRPC.GetDAGStatusRequestProto.newBuilder() |
| 253 | + .setDagId(dagId) |
| 254 | + .setTimeout(Math.min(pollIntervalMs, remainingMs)) |
| 255 | + .build()); |
| 256 | + if (response.hasDagStatus() && response.getDagStatus().hasState() |
| 257 | + && isTerminalDagState(response.getDagStatus().getState())) { |
| 258 | + LOG.info("External session: dagId {} on app ID {} reached terminal state {}", dagId, externalAppId, |
| 259 | + response.getDagStatus().getState()); |
| 260 | + return; |
| 261 | + } |
| 262 | + } |
| 263 | + throw new TezException("Timed out after " + timeoutMs + " ms waiting for orphan DAG " + dagId |
| 264 | + + " on app ID " + externalAppId + " to reach terminal state after kill"); |
| 265 | + } |
| 266 | + |
| 267 | + private static boolean isTerminalDagState(DAGProtos.DAGStatusStateProto state) { |
| 268 | + return switch (state) { |
| 269 | + case DAG_SUCCEEDED, DAG_KILLED, DAG_FAILED, DAG_ERROR -> true; |
| 270 | + default -> false; |
| 271 | + }; |
| 272 | + } |
190 | 273 | } |
0 commit comments