diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index d27a25e26..eeeee4e89 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -8,6 +8,28 @@ ### Fixed +- Reclassify transient/mis-categorized server errors so callers can identify + retryable failures. The remap is applied at all Thrift error sites + (`checkResponseForErrors`, `executeAsync`, `verifySuccessStatus`, and the + polling status handler) so the same server failure surfaces with the same + SQL state regardless of which response carries it. + - Unity Catalog unavailability (`[UC_CLIENT_EXCEPTION]`, previously `XXUCC`) + and parquet read / connection-acquisition deadlines + (`[PARQUET_FAILED_READ_FOOTER]`, `DEADLINE_EXCEEDED: acquiring connection`) + are now reported with SQL state `08S01` (communication link failure). + - Server-side `java.util.ConcurrentModificationException` is now reported + with SQL state `40001` (serialization failure) instead of the misleading + `42000`. The remap only applies when the original SQL state is `42000` so + unrelated `42xxx` states (e.g. `42501` insufficient privilege) are + preserved. + Notes for callers and operators: + - Callers branching on the legacy `XXUCC`/`42000` states for these failures + must update to `08S01`/`40001`. The driver logs the original→remapped + state at `INFO` level for traceability. + - The driver's failure telemetry uses `sqlState` as the error-name field, + so dashboards/alerts keyed on `XXUCC` or `42000` for these specific + failure modes will need to be updated to the new states. + --- *Note: When making changes, please add your change under the appropriate section with a brief description.* diff --git a/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java b/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java index 267a452fb..87c3049ee 100644 --- a/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java +++ b/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java @@ -116,6 +116,12 @@ public final class DatabricksJdbcConstants { /** Standard SQL state for communication link failure (SQLSTATE 08S01). */ public static final String COMMUNICATION_LINK_FAILURE_SQLSTATE = "08S01"; + /** + * Standard SQL state for transaction rollback - serialization failure (SQLSTATE 40001). Used for + * concurrent-modification errors where the operation is potentially retryable. + */ + public static final String SERIALIZATION_FAILURE_SQLSTATE = "40001"; + public static final int TEMPORARY_REDIRECT_STATUS_CODE = 307; public static final String REDACTED_TOKEN = "****"; public static final String QUERY_TAGS = "query_tags"; diff --git a/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java b/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java index 53a7e383b..6ea9076f4 100644 --- a/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java +++ b/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java @@ -109,7 +109,16 @@ public static void verifySuccessStatus(TStatus status, String errorContext, Stri errorMessage, sqlState, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR); } - throw new DatabricksHttpException(errorMessage, sqlState); + String remappedSqlState = + SqlStateClassifier.classifyTransientSqlState(status.getErrorMessage(), sqlState); + if (!Objects.equals(remappedSqlState, sqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in thrift status (context: {})", + sqlState, + remappedSqlState, + errorContext); + } + throw new DatabricksHttpException(errorMessage, remappedSqlState); } } diff --git a/src/main/java/com/databricks/jdbc/common/util/SqlStateClassifier.java b/src/main/java/com/databricks/jdbc/common/util/SqlStateClassifier.java new file mode 100644 index 000000000..465b058df --- /dev/null +++ b/src/main/java/com/databricks/jdbc/common/util/SqlStateClassifier.java @@ -0,0 +1,57 @@ +package com.databricks.jdbc.common.util; + +import static com.databricks.jdbc.common.DatabricksJdbcConstants.COMMUNICATION_LINK_FAILURE_SQLSTATE; +import static com.databricks.jdbc.common.DatabricksJdbcConstants.SERIALIZATION_FAILURE_SQLSTATE; + +/** + * Reclassifies SQL states for known transient or mis-categorized server errors so callers can + * programmatically identify retryable failures. + * + *

Patterns handled: + * + *

+ * + *

Patterns are anchored on stable server-emitted tokens (bracketed Spark error classes, + * fully-qualified Java exception names) rather than English prose, so user-supplied SQL string + * literals cannot trigger a false remap and server message rewording does not silently regress the + * classifier. + */ +public final class SqlStateClassifier { + + private static final String SYNTAX_OR_ACCESS_VIOLATION_SQLSTATE = "42000"; + + private SqlStateClassifier() {} + + /** + * Returns a remapped SQL state if {@code errorMessage} matches a known transient pattern, or + * {@code originalSqlState} otherwise. Pure function — callers should log the remap separately + * with their own context (statement ID, response). + */ + public static String classifyTransientSqlState(String errorMessage, String originalSqlState) { + if (errorMessage == null) { + return originalSqlState; + } + // Bracketed Spark error classes are the outer cause; check before nested-Java-exception + // patterns like ConcurrentModificationException, which may appear inside a UC/Parquet + // wrapping. + if (errorMessage.contains("[UC_CLIENT_EXCEPTION]") + || errorMessage.contains("[PARQUET_FAILED_READ_FOOTER]") + || errorMessage.contains("DEADLINE_EXCEEDED: acquiring connection")) { + return COMMUNICATION_LINK_FAILURE_SQLSTATE; + } + if (SYNTAX_OR_ACCESS_VIOLATION_SQLSTATE.equals(originalSqlState) + && errorMessage.contains("java.util.ConcurrentModificationException")) { + return SERIALIZATION_FAILURE_SQLSTATE; + } + return originalSqlState; + } +} diff --git a/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java b/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java index 5981ac195..168b1a0a5 100644 --- a/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java +++ b/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java @@ -7,6 +7,7 @@ import static com.databricks.jdbc.common.EnvironmentVariables.DEFAULT_RESULT_ROW_LIMIT; import static com.databricks.jdbc.common.util.DatabricksTypeUtil.DECIMAL; import static com.databricks.jdbc.common.util.DatabricksTypeUtil.getDecimalTypeString; +import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState; import static com.databricks.jdbc.dbclient.impl.sqlexec.PathConstants.*; import static com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode.TEMPORARY_REDIRECT_EXCEPTION; @@ -766,8 +767,16 @@ void handleFailedExecution( throw new DatabricksTimeoutException( errorMessage, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR); } + String remappedSqlState = classifyTransientSqlState(errorMessage, sqlState); + if (!Objects.equals(remappedSqlState, sqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in SEA statement [{}]", + sqlState, + remappedSqlState, + statementId); + } throw new DatabricksSQLException( - errorMessage, sqlState, DatabricksDriverErrorCode.EXECUTE_STATEMENT_FAILED); + errorMessage, remappedSqlState, DatabricksDriverErrorCode.EXECUTE_STATEMENT_FAILED); } private ExecuteStatementResponse wrapGetStatementResponse( diff --git a/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java b/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java index e1916d94c..2ff8dbfca 100644 --- a/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java +++ b/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java @@ -5,6 +5,7 @@ import static com.databricks.jdbc.common.DatabricksJdbcConstants.QUERY_EXECUTION_TIMEOUT_SQLSTATE; import static com.databricks.jdbc.common.EnvironmentVariables.*; import static com.databricks.jdbc.common.util.DatabricksThriftUtil.*; +import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState; import com.databricks.jdbc.api.impl.*; import com.databricks.jdbc.api.internal.IDatabricksConnectionContext; @@ -29,6 +30,7 @@ import com.databricks.sdk.service.sql.StatementState; import java.sql.SQLException; import java.util.Arrays; +import java.util.Objects; import java.util.concurrent.TimeUnit; import org.apache.http.HttpException; import org.apache.thrift.TBase; @@ -379,7 +381,16 @@ DatabricksResultSet executeAsync( "Received error response {} from Thrift Server for request {}", response, request.toString()); - throw new DatabricksSQLException(response.status.errorMessage, response.status.sqlState); + String originalSqlState = response.status.sqlState; + String remappedSqlState = + classifyTransientSqlState(response.status.errorMessage, originalSqlState); + if (!Objects.equals(remappedSqlState, originalSqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in async execute response", + originalSqlState, + remappedSqlState); + } + throw new DatabricksSQLException(response.status.errorMessage, remappedSqlState); } } catch (DatabricksSQLException | TException e) { @@ -819,7 +830,16 @@ private , F extends TFieldIdEnum> void checkResponseForErr if (!response.isSet(operationHandleField) || isErrorStatusCode(status)) { // if the operationHandle has not been set, it is an error from the server. LOGGER.error("Error thrift response {}", response); - throw new DatabricksSQLException(status.getErrorMessage(), status.getSqlState()); + String originalSqlState = status.getSqlState(); + String remappedSqlState = + classifyTransientSqlState(status.getErrorMessage(), originalSqlState); + if (!Objects.equals(remappedSqlState, originalSqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in thrift response", + originalSqlState, + remappedSqlState); + } + throw new DatabricksSQLException(status.getErrorMessage(), remappedSqlState); } } @@ -840,8 +860,16 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S + "error: [%s]", statusResp.getStatus().getStatusCode(), statementId, serverError); LOGGER.error(errorMsg); - throw new DatabricksSQLException( - errorMsg, statusResp.isSetSqlState() ? statusResp.getSqlState() : null); + String originalSqlState = statusResp.isSetSqlState() ? statusResp.getSqlState() : null; + String remappedSqlState = classifyTransientSqlState(serverError, originalSqlState); + if (!Objects.equals(remappedSqlState, originalSqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in statement [{}]", + originalSqlState, + remappedSqlState, + statementId); + } + throw new DatabricksSQLException(errorMsg, remappedSqlState); } if (statusResp.isSetOperationState() @@ -864,7 +892,15 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S errorMsg, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR); } - throw new DatabricksSQLException(errorMsg, sqlState); + String remappedSqlState = classifyTransientSqlState(serverError, sqlState); + if (!Objects.equals(remappedSqlState, sqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in statement [{}]", + sqlState, + remappedSqlState, + statementId); + } + throw new DatabricksSQLException(errorMsg, remappedSqlState); } } diff --git a/src/test/java/com/databricks/jdbc/common/util/SqlStateClassifierTest.java b/src/test/java/com/databricks/jdbc/common/util/SqlStateClassifierTest.java new file mode 100644 index 000000000..2475d2516 --- /dev/null +++ b/src/test/java/com/databricks/jdbc/common/util/SqlStateClassifierTest.java @@ -0,0 +1,102 @@ +package com.databricks.jdbc.common.util; + +import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.Test; + +class SqlStateClassifierTest { + + @Test + void unityCatalogBracketedTokenRemapsTo08S01() { + String ucMessage = + "Error running query: [UC_CLIENT_EXCEPTION] " + + "com.databricks.sql.managedcatalog.UnityCatalogClientException: " + + "[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + assertEquals("08S01", classifyTransientSqlState(ucMessage, "XXUCC")); + } + + @Test + void parquetReadFooterBracketedTokenRemapsTo08S01() { + String message = + "Error running query: [PARQUET_FAILED_READ_FOOTER] " + + "com.databricks.sql.io.parquet.ParquetFailedReadFooterException: " + + "DEADLINE_EXCEEDED: acquiring connection"; + assertEquals("08S01", classifyTransientSqlState(message, null)); + } + + @Test + void deadlineExceededAcquiringConnectionRemapsTo08S01() { + assertEquals( + "08S01", + classifyTransientSqlState("DEADLINE_EXCEEDED: acquiring connection from pool", null)); + } + + @Test + void concurrentModificationGatedOn42000() { + String message = + "Error running query: java.util.ConcurrentModificationException: " + + "mutation occurred during iteration"; + assertEquals("40001", classifyTransientSqlState(message, "42000")); + } + + @Test + void concurrentModificationDoesNotRemapWhenOriginalStateIsNot42000() { + String message = + "Error running query: java.util.ConcurrentModificationException: " + + "mutation occurred during iteration"; + assertEquals("42501", classifyTransientSqlState(message, "42501")); + assertEquals("XXUCC", classifyTransientSqlState(message, "XXUCC")); + assertNull(classifyTransientSqlState(message, null)); + } + + @Test + void bareConcurrentModificationExceptionWithoutFqnDoesNotRemap() { + assertEquals( + "42000", + classifyTransientSqlState( + "SELECT 'ConcurrentModificationException' FROM nonexistent_tbl", "42000")); + } + + @Test + void unbracketedUcProseDoesNotTriggerRemap() { + assertEquals( + "42S02", + classifyTransientSqlState( + "User-supplied literal: Failed to contact the Unity Catalog server", "42S02")); + } + + @Test + void unrelatedErrorPreservesOriginalState() { + assertEquals( + "42S02", classifyTransientSqlState("Table or view not found: foo.bar.baz", "42S02")); + } + + @Test + void nullMessagePreservesOriginalState() { + assertNull(classifyTransientSqlState(null, null)); + assertEquals("42S02", classifyTransientSqlState(null, "42S02")); + } + + @Test + void emptyOriginalStateIsPreservedWhenUnrelated() { + assertEquals("", classifyTransientSqlState("Some unrelated error", "")); + } + + @Test + void caseSensitivityIsExplicit() { + assertEquals( + "42S02", + classifyTransientSqlState("Lowercase [uc_client_exception] should not match", "42S02")); + } + + @Test + void ucCheckRunsBeforeCmeWhenOriginalStateIs42000AndBothSubstringsPresent() { + String message = + "[UC_CLIENT_EXCEPTION] catalog server: caused by " + + "java.util.ConcurrentModificationException: nested"; + assertEquals("08S01", classifyTransientSqlState(message, "42000")); + } +} diff --git a/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java b/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java index 16f094ffd..66cd3afde 100644 --- a/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java +++ b/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java @@ -339,6 +339,38 @@ public void testHandleFailedExecution_FailedState_ThrowsWithoutHY008() throws Ex assertTrue(exception.getMessage().contains("execution failed")); } + @Test + public void testHandleFailedExecution_unityCatalogError_remapsToCommunicationLinkFailure() + throws Exception { + IDatabricksConnectionContext connectionContext = + DatabricksConnectionContext.parse(JDBC_URL, new Properties()); + DatabricksSdkClient databricksSdkClient = + new DatabricksSdkClient(connectionContext, statementExecutionService, apiClient); + + StatementStatus failedStatus = + new StatementStatus() + .setState(StatementState.FAILED) + .setSqlState("XXUCC") + .setError( + new ServiceError() + .setMessage( + "[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED")); + ExecuteStatementResponse response = + new ExecuteStatementResponse() + .setStatementId(STATEMENT_ID.toSQLExecStatementId()) + .setStatus(failedStatus); + + DatabricksSQLException exception = + assertThrows( + DatabricksSQLException.class, + () -> + databricksSdkClient.handleFailedExecution( + response, STATEMENT_ID.toSQLExecStatementId(), STATEMENT)); + + assertEquals("08S01", exception.getSQLState(), "Expected XXUCC to be remapped to 08S01"); + } + @Test public void testGetStatementResult_CancelledState_ThrowsWithHY008() throws Exception { IDatabricksConnectionContext connectionContext = diff --git a/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java b/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java index 79b93e081..77b279cfb 100644 --- a/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java +++ b/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java @@ -1225,6 +1225,128 @@ void testMetadataPollingWithSleepBetweenPolls() assertTrue(elapsed >= 150, "Expected at least 150ms elapsed due to poll sleep, got " + elapsed); } + @Test + void testExecute_remapsUcErrorOnStatusCodeBranchToCommunicationLinkFailure() + throws TException, SQLException, DatabricksValidationException { + setup(true); + TExecuteStatementReq request = new TExecuteStatementReq(); + TExecuteStatementResp tExecuteStatementResp = + new TExecuteStatementResp() + .setOperationHandle(tOperationHandle) + .setStatus(new TStatus().setStatusCode(TStatusCode.SUCCESS_STATUS)); + + String ucErrorMessage = + "Error running query: [UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + // ERROR_STATUS triggers the status-code branch in checkOperationStatusForErrors first. + TGetOperationStatusResp ucErrorResp = + new TGetOperationStatusResp() + .setStatus( + new TStatus() + .setStatusCode(TStatusCode.ERROR_STATUS) + .setErrorMessage(ucErrorMessage) + .setSqlState("XXUCC")) + .setSqlState("XXUCC") + .setOperationState(TOperationState.ERROR_STATE); + + when(thriftClient.ExecuteStatement(request)).thenReturn(tExecuteStatementResp); + when(thriftClient.GetOperationStatus(any(TGetOperationStatusReq.class))) + .thenReturn(ucErrorResp); + Statement statement = mock(Statement.class); + when(parentStatement.getStatement()).thenReturn(statement); + when(statement.getQueryTimeout()).thenReturn(0); + + DatabricksSQLException e = + assertThrows( + DatabricksSQLException.class, + () -> accessor.execute(request, parentStatement, session, StatementType.SQL)); + assertEquals("08S01", e.getSQLState(), "Expected UC error to be remapped to 08S01"); + assertNotEquals("XXUCC", e.getSQLState(), "Expected XXUCC to have been remapped"); + assertTrue(e.getMessage().contains("UC_CLIENT_EXCEPTION")); + } + + @Test + void testExecute_remapsUcErrorOnOperationStateBranchToCommunicationLinkFailure() + throws TException, SQLException, DatabricksValidationException { + setup(true); + TExecuteStatementReq request = new TExecuteStatementReq(); + TExecuteStatementResp tExecuteStatementResp = + new TExecuteStatementResp() + .setOperationHandle(tOperationHandle) + .setStatus(new TStatus().setStatusCode(TStatusCode.SUCCESS_STATUS)); + + String ucErrorMessage = + "Error running query: [UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + // SUCCESS_STATUS on TStatus skips the status-code branch and falls through to the + // operation-state branch (the second classifier call site in checkOperationStatusForErrors). + TGetOperationStatusResp ucErrorResp = + new TGetOperationStatusResp() + .setStatus( + new TStatus() + .setStatusCode(TStatusCode.SUCCESS_STATUS) + .setErrorMessage(ucErrorMessage) + .setSqlState("XXUCC")) + .setSqlState("XXUCC") + .setOperationState(TOperationState.ERROR_STATE); + + when(thriftClient.ExecuteStatement(request)).thenReturn(tExecuteStatementResp); + when(thriftClient.GetOperationStatus(any(TGetOperationStatusReq.class))) + .thenReturn(ucErrorResp); + Statement statement = mock(Statement.class); + when(parentStatement.getStatement()).thenReturn(statement); + when(statement.getQueryTimeout()).thenReturn(0); + + DatabricksSQLException e = + assertThrows( + DatabricksSQLException.class, + () -> accessor.execute(request, parentStatement, session, StatementType.SQL)); + assertEquals( + "08S01", + e.getSQLState(), + "Expected UC error on operation-state branch to be remapped to 08S01"); + } + + @Test + void testExecute_remapsConcurrentModificationOnOperationStateBranchToSerializationFailure() + throws TException, SQLException, DatabricksValidationException { + setup(true); + TExecuteStatementReq request = new TExecuteStatementReq(); + TExecuteStatementResp tExecuteStatementResp = + new TExecuteStatementResp() + .setOperationHandle(tOperationHandle) + .setStatus(new TStatus().setStatusCode(TStatusCode.SUCCESS_STATUS)); + + String cmeErrorMessage = + "Error running query: java.util.ConcurrentModificationException: " + + "mutation occurred during iteration"; + TGetOperationStatusResp cmeErrorResp = + new TGetOperationStatusResp() + .setStatus( + new TStatus() + .setStatusCode(TStatusCode.SUCCESS_STATUS) + .setErrorMessage(cmeErrorMessage) + .setSqlState("42000")) + .setSqlState("42000") + .setOperationState(TOperationState.ERROR_STATE); + + when(thriftClient.ExecuteStatement(request)).thenReturn(tExecuteStatementResp); + when(thriftClient.GetOperationStatus(any(TGetOperationStatusReq.class))) + .thenReturn(cmeErrorResp); + Statement statement = mock(Statement.class); + when(parentStatement.getStatement()).thenReturn(statement); + when(statement.getQueryTimeout()).thenReturn(0); + + DatabricksSQLException e = + assertThrows( + DatabricksSQLException.class, + () -> accessor.execute(request, parentStatement, session, StatementType.SQL)); + assertEquals( + "40001", + e.getSQLState(), + "Expected ConcurrentModificationException with 42000 to be remapped to 40001"); + } + private TFetchResultsReq getFetchResultsRequest(boolean includeMetadata) throws DatabricksValidationException { TFetchResultsReq request =