From d4a936751da20196f82aea9ba6af81b505cc8c17 Mon Sep 17 00:00:00 2001 From: samikshya-chand_data Date: Tue, 28 Apr 2026 15:40:36 +0530 Subject: [PATCH 1/3] Classify transient/mis-categorized server errors with retryable SQL states Several DatabricksSQLException categorizations were either missing a SQL state or assigning one that did not reflect a retryable transient failure, making it impossible for callers to identify retryable errors at runtime. Reclassify three patterns in checkOperationStatusForErrors via a new classifyTransientSqlState helper: - Unity Catalog unavailability (UC_CLIENT_EXCEPTION / "Failed to contact the Unity Catalog server", currently undocumented XXUCC) -> 08S01 (communication link failure, retryable). - Connection-acquisition / parquet read deadlines (PARQUET_FAILED_READ_FOOTER, "DEADLINE_EXCEEDED: acquiring connection", currently no SQL state) -> 08S01. - Server-side ConcurrentModificationException currently mis-mapped to 42000 (syntax/access violation) -> 40001 (serialization failure, retryable concurrency error). Adds unit tests for the classifier plus an end-to-end test verifying the SQL state is reassigned through the polling pipeline. Co-authored-by: Isaac Signed-off-by: samikshya-chand_data --- NEXT_CHANGELOG.md | 8 ++ .../jdbc/common/DatabricksJdbcConstants.java | 6 ++ .../impl/thrift/DatabricksThriftAccessor.java | 38 +++++++++- .../thrift/DatabricksThriftAccessorTest.java | 74 +++++++++++++++++++ 4 files changed, 124 insertions(+), 2 deletions(-) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index d27a25e263..e1bc32dba3 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -8,6 +8,14 @@ ### Fixed +- Reclassify transient/mis-categorized server errors so callers can identify + retryable failures: Unity Catalog unavailability (`UC_CLIENT_EXCEPTION` / + `XXUCC`) and parquet read / connection-acquisition deadlines + (`PARQUET_FAILED_READ_FOOTER`, `DEADLINE_EXCEEDED: acquiring connection`) + are now reported with SQL state `08S01` (communication link failure). + Server-side `ConcurrentModificationException` is now reported with SQL state + `40001` (serialization failure) instead of the misleading `42000`. + --- *Note: When making changes, please add your change under the appropriate section with a brief description.* diff --git a/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java b/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java index 267a452fb5..87c3049eea 100644 --- a/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java +++ b/src/main/java/com/databricks/jdbc/common/DatabricksJdbcConstants.java @@ -116,6 +116,12 @@ public final class DatabricksJdbcConstants { /** Standard SQL state for communication link failure (SQLSTATE 08S01). */ public static final String COMMUNICATION_LINK_FAILURE_SQLSTATE = "08S01"; + /** + * Standard SQL state for transaction rollback - serialization failure (SQLSTATE 40001). Used for + * concurrent-modification errors where the operation is potentially retryable. + */ + public static final String SERIALIZATION_FAILURE_SQLSTATE = "40001"; + public static final int TEMPORARY_REDIRECT_STATUS_CODE = 307; public static final String REDACTED_TOKEN = "****"; public static final String QUERY_TAGS = "query_tags"; diff --git a/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java b/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java index e1916d94c4..b99b0981d2 100644 --- a/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java +++ b/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java @@ -3,6 +3,7 @@ import static com.databricks.jdbc.common.DatabricksJdbcConstants.COMMUNICATION_LINK_FAILURE_SQLSTATE; import static com.databricks.jdbc.common.DatabricksJdbcConstants.OPERATION_CANCELLED_SQLSTATE; import static com.databricks.jdbc.common.DatabricksJdbcConstants.QUERY_EXECUTION_TIMEOUT_SQLSTATE; +import static com.databricks.jdbc.common.DatabricksJdbcConstants.SERIALIZATION_FAILURE_SQLSTATE; import static com.databricks.jdbc.common.EnvironmentVariables.*; import static com.databricks.jdbc.common.util.DatabricksThriftUtil.*; @@ -840,8 +841,9 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S + "error: [%s]", statusResp.getStatus().getStatusCode(), statementId, serverError); LOGGER.error(errorMsg); + String originalSqlState = statusResp.isSetSqlState() ? statusResp.getSqlState() : null; throw new DatabricksSQLException( - errorMsg, statusResp.isSetSqlState() ? statusResp.getSqlState() : null); + errorMsg, classifyTransientSqlState(serverError, originalSqlState)); } if (statusResp.isSetOperationState() @@ -864,10 +866,42 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S errorMsg, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR); } - throw new DatabricksSQLException(errorMsg, sqlState); + throw new DatabricksSQLException(errorMsg, classifyTransientSqlState(serverError, sqlState)); } } + /** + * Reclassifies the SQL state for known transient or mis-categorized server errors so callers can + * programmatically identify retryable failures. Returns {@code originalSqlState} when no known + * pattern matches. + * + *

Patterns handled: + * + *

+ */ + static String classifyTransientSqlState(String errorMessage, String originalSqlState) { + if (errorMessage == null) { + return originalSqlState; + } + if (errorMessage.contains("ConcurrentModificationException")) { + return SERIALIZATION_FAILURE_SQLSTATE; + } + if (errorMessage.contains("UC_CLIENT_EXCEPTION") + || errorMessage.contains("Failed to contact the Unity Catalog server") + || errorMessage.contains("PARQUET_FAILED_READ_FOOTER") + || errorMessage.contains("DEADLINE_EXCEEDED: acquiring connection")) { + return COMMUNICATION_LINK_FAILURE_SQLSTATE; + } + return originalSqlState; + } + /** * Enriches a null or empty error message from TStatus by including errorCode, errorDetailsJson, * and infoMessages. Returns the original errorMessage if it is already present. diff --git a/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java b/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java index 79b93e0812..7dee69b1a1 100644 --- a/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java +++ b/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java @@ -1225,6 +1225,80 @@ void testMetadataPollingWithSleepBetweenPolls() assertTrue(elapsed >= 150, "Expected at least 150ms elapsed due to poll sleep, got " + elapsed); } + @Test + void classifyTransientSqlState_unityCatalogUnavailable_remapsXxuccTo08S01() { + String ucMessage = + "Error running query: [UC_CLIENT_EXCEPTION] " + + "com.databricks.sql.managedcatalog.UnityCatalogClientException: " + + "[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + assertEquals("08S01", DatabricksThriftAccessor.classifyTransientSqlState(ucMessage, "XXUCC")); + } + + @Test + void classifyTransientSqlState_concurrentModification_remaps42000To40001() { + String message = + "Error running query: java.util.ConcurrentModificationException: " + + "mutation occurred during iteration"; + assertEquals("40001", DatabricksThriftAccessor.classifyTransientSqlState(message, "42000")); + } + + @Test + void classifyTransientSqlState_parquetReadFooterDeadline_assigns08S01WhenNullState() { + String message = + "Error running query: [PARQUET_FAILED_READ_FOOTER] " + + "com.databricks.sql.io.parquet.ParquetFailedReadFooterException: " + + "DEADLINE_EXCEEDED: acquiring connection"; + assertEquals("08S01", DatabricksThriftAccessor.classifyTransientSqlState(message, null)); + } + + @Test + void classifyTransientSqlState_unrelatedError_preservesOriginalState() { + assertEquals( + "42S02", + DatabricksThriftAccessor.classifyTransientSqlState( + "Table or view not found: foo.bar.baz", "42S02")); + assertNull(DatabricksThriftAccessor.classifyTransientSqlState(null, null)); + } + + @Test + void testExecute_remapsUnityCatalogErrorToCommunicationLinkFailure() + throws TException, SQLException, DatabricksValidationException { + setup(true); + TExecuteStatementReq request = new TExecuteStatementReq(); + TExecuteStatementResp tExecuteStatementResp = + new TExecuteStatementResp() + .setOperationHandle(tOperationHandle) + .setStatus(new TStatus().setStatusCode(TStatusCode.SUCCESS_STATUS)); + + String ucErrorMessage = + "Error running query: [UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + TGetOperationStatusResp ucErrorResp = + new TGetOperationStatusResp() + .setStatus( + new TStatus() + .setStatusCode(TStatusCode.ERROR_STATUS) + .setErrorMessage(ucErrorMessage) + .setSqlState("XXUCC")) + .setSqlState("XXUCC") + .setOperationState(TOperationState.ERROR_STATE); + + when(thriftClient.ExecuteStatement(request)).thenReturn(tExecuteStatementResp); + when(thriftClient.GetOperationStatus(any(TGetOperationStatusReq.class))) + .thenReturn(ucErrorResp); + Statement statement = mock(Statement.class); + when(parentStatement.getStatement()).thenReturn(statement); + when(statement.getQueryTimeout()).thenReturn(0); + + DatabricksSQLException e = + assertThrows( + DatabricksSQLException.class, + () -> accessor.execute(request, parentStatement, session, StatementType.SQL)); + assertEquals("08S01", e.getSQLState(), "Expected UC error to be remapped to 08S01"); + assertTrue(e.getMessage().contains("UC_CLIENT_EXCEPTION")); + } + private TFetchResultsReq getFetchResultsRequest(boolean includeMetadata) throws DatabricksValidationException { TFetchResultsReq request = From b7eeecac2f92cc356d2d968183b6eae1b5bc6a05 Mon Sep 17 00:00:00 2001 From: samikshya-chand_data Date: Tue, 28 Apr 2026 15:12:39 +0000 Subject: [PATCH 2/3] Address review feedback: harden classifier and apply at all Thrift error sites Promotes classifyTransientSqlState to a shared SqlStateClassifier helper and calls it from every Thrift error site (executeAsync, checkResponseForErrors, verifySuccessStatus, and both branches of checkOperationStatusForErrors) so the same server failure surfaces with the same SQL state regardless of which response carries it. Tightens patterns to anchor on stable server-emitted tokens ([UC_CLIENT_EXCEPTION], [PARQUET_FAILED_READ_FOOTER], java.util.ConcurrentModificationException) instead of English prose, so user SQL string literals cannot trigger a false remap. Gates the ConcurrentModificationException -> 40001 remap on originalSqlState == 42000 to preserve unrelated 42xxx states (e.g. 42501 insufficient privilege). Logs the original -> remapped state at INFO at each remap site so support engineers can recover the original state from logs. Replaces helper-coupled unit tests with a dedicated SqlStateClassifierTest and adds end-to-end tests for the operation-state branch and the CME path in DatabricksThriftAccessorTest. Updates NEXT_CHANGELOG.md to call out (a) callers depending on XXUCC/42000 must update and (b) the failure-telemetry error-name bucket migration. Co-authored-by: Isaac Signed-off-by: samikshya-chand_data --- NEXT_CHANGELOG.md | 26 +++- .../common/util/DatabricksThriftUtil.java | 11 +- .../jdbc/common/util/SqlStateClassifier.java | 57 +++++++++ .../impl/thrift/DatabricksThriftAccessor.java | 78 ++++++------ .../common/util/SqlStateClassifierTest.java | 102 +++++++++++++++ .../thrift/DatabricksThriftAccessorTest.java | 116 +++++++++++++----- 6 files changed, 311 insertions(+), 79 deletions(-) create mode 100644 src/main/java/com/databricks/jdbc/common/util/SqlStateClassifier.java create mode 100644 src/test/java/com/databricks/jdbc/common/util/SqlStateClassifierTest.java diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index e1bc32dba3..eeeee4e897 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -9,12 +9,26 @@ ### Fixed - Reclassify transient/mis-categorized server errors so callers can identify - retryable failures: Unity Catalog unavailability (`UC_CLIENT_EXCEPTION` / - `XXUCC`) and parquet read / connection-acquisition deadlines - (`PARQUET_FAILED_READ_FOOTER`, `DEADLINE_EXCEEDED: acquiring connection`) - are now reported with SQL state `08S01` (communication link failure). - Server-side `ConcurrentModificationException` is now reported with SQL state - `40001` (serialization failure) instead of the misleading `42000`. + retryable failures. The remap is applied at all Thrift error sites + (`checkResponseForErrors`, `executeAsync`, `verifySuccessStatus`, and the + polling status handler) so the same server failure surfaces with the same + SQL state regardless of which response carries it. + - Unity Catalog unavailability (`[UC_CLIENT_EXCEPTION]`, previously `XXUCC`) + and parquet read / connection-acquisition deadlines + (`[PARQUET_FAILED_READ_FOOTER]`, `DEADLINE_EXCEEDED: acquiring connection`) + are now reported with SQL state `08S01` (communication link failure). + - Server-side `java.util.ConcurrentModificationException` is now reported + with SQL state `40001` (serialization failure) instead of the misleading + `42000`. The remap only applies when the original SQL state is `42000` so + unrelated `42xxx` states (e.g. `42501` insufficient privilege) are + preserved. + Notes for callers and operators: + - Callers branching on the legacy `XXUCC`/`42000` states for these failures + must update to `08S01`/`40001`. The driver logs the original→remapped + state at `INFO` level for traceability. + - The driver's failure telemetry uses `sqlState` as the error-name field, + so dashboards/alerts keyed on `XXUCC` or `42000` for these specific + failure modes will need to be updated to the new states. --- *Note: When making changes, please add your change under the appropriate section diff --git a/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java b/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java index 53a7e383bb..6ea9076f44 100644 --- a/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java +++ b/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java @@ -109,7 +109,16 @@ public static void verifySuccessStatus(TStatus status, String errorContext, Stri errorMessage, sqlState, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR); } - throw new DatabricksHttpException(errorMessage, sqlState); + String remappedSqlState = + SqlStateClassifier.classifyTransientSqlState(status.getErrorMessage(), sqlState); + if (!Objects.equals(remappedSqlState, sqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in thrift status (context: {})", + sqlState, + remappedSqlState, + errorContext); + } + throw new DatabricksHttpException(errorMessage, remappedSqlState); } } diff --git a/src/main/java/com/databricks/jdbc/common/util/SqlStateClassifier.java b/src/main/java/com/databricks/jdbc/common/util/SqlStateClassifier.java new file mode 100644 index 0000000000..465b058df5 --- /dev/null +++ b/src/main/java/com/databricks/jdbc/common/util/SqlStateClassifier.java @@ -0,0 +1,57 @@ +package com.databricks.jdbc.common.util; + +import static com.databricks.jdbc.common.DatabricksJdbcConstants.COMMUNICATION_LINK_FAILURE_SQLSTATE; +import static com.databricks.jdbc.common.DatabricksJdbcConstants.SERIALIZATION_FAILURE_SQLSTATE; + +/** + * Reclassifies SQL states for known transient or mis-categorized server errors so callers can + * programmatically identify retryable failures. + * + *

Patterns handled: + * + *

    + *
  • Unity Catalog unavailability ({@code [UC_CLIENT_EXCEPTION]}) → {@code 08S01} (communication + * link failure, retryable). + *
  • Connection-acquisition / parquet read deadlines ({@code [PARQUET_FAILED_READ_FOOTER]}, + * {@code DEADLINE_EXCEEDED: acquiring connection}) → {@code 08S01}. + *
  • Server-side {@code java.util.ConcurrentModificationException} mis-mapped to {@code 42000} + * (syntax/access violation) → {@code 40001} (serialization failure, retryable). Only applied + * when the original SQL state is {@code 42000} so unrelated {@code 42xxx} states are + * preserved. + *
+ * + *

Patterns are anchored on stable server-emitted tokens (bracketed Spark error classes, + * fully-qualified Java exception names) rather than English prose, so user-supplied SQL string + * literals cannot trigger a false remap and server message rewording does not silently regress the + * classifier. + */ +public final class SqlStateClassifier { + + private static final String SYNTAX_OR_ACCESS_VIOLATION_SQLSTATE = "42000"; + + private SqlStateClassifier() {} + + /** + * Returns a remapped SQL state if {@code errorMessage} matches a known transient pattern, or + * {@code originalSqlState} otherwise. Pure function — callers should log the remap separately + * with their own context (statement ID, response). + */ + public static String classifyTransientSqlState(String errorMessage, String originalSqlState) { + if (errorMessage == null) { + return originalSqlState; + } + // Bracketed Spark error classes are the outer cause; check before nested-Java-exception + // patterns like ConcurrentModificationException, which may appear inside a UC/Parquet + // wrapping. + if (errorMessage.contains("[UC_CLIENT_EXCEPTION]") + || errorMessage.contains("[PARQUET_FAILED_READ_FOOTER]") + || errorMessage.contains("DEADLINE_EXCEEDED: acquiring connection")) { + return COMMUNICATION_LINK_FAILURE_SQLSTATE; + } + if (SYNTAX_OR_ACCESS_VIOLATION_SQLSTATE.equals(originalSqlState) + && errorMessage.contains("java.util.ConcurrentModificationException")) { + return SERIALIZATION_FAILURE_SQLSTATE; + } + return originalSqlState; + } +} diff --git a/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java b/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java index b99b0981d2..2ff8dbfcaa 100644 --- a/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java +++ b/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java @@ -3,9 +3,9 @@ import static com.databricks.jdbc.common.DatabricksJdbcConstants.COMMUNICATION_LINK_FAILURE_SQLSTATE; import static com.databricks.jdbc.common.DatabricksJdbcConstants.OPERATION_CANCELLED_SQLSTATE; import static com.databricks.jdbc.common.DatabricksJdbcConstants.QUERY_EXECUTION_TIMEOUT_SQLSTATE; -import static com.databricks.jdbc.common.DatabricksJdbcConstants.SERIALIZATION_FAILURE_SQLSTATE; import static com.databricks.jdbc.common.EnvironmentVariables.*; import static com.databricks.jdbc.common.util.DatabricksThriftUtil.*; +import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState; import com.databricks.jdbc.api.impl.*; import com.databricks.jdbc.api.internal.IDatabricksConnectionContext; @@ -30,6 +30,7 @@ import com.databricks.sdk.service.sql.StatementState; import java.sql.SQLException; import java.util.Arrays; +import java.util.Objects; import java.util.concurrent.TimeUnit; import org.apache.http.HttpException; import org.apache.thrift.TBase; @@ -380,7 +381,16 @@ DatabricksResultSet executeAsync( "Received error response {} from Thrift Server for request {}", response, request.toString()); - throw new DatabricksSQLException(response.status.errorMessage, response.status.sqlState); + String originalSqlState = response.status.sqlState; + String remappedSqlState = + classifyTransientSqlState(response.status.errorMessage, originalSqlState); + if (!Objects.equals(remappedSqlState, originalSqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in async execute response", + originalSqlState, + remappedSqlState); + } + throw new DatabricksSQLException(response.status.errorMessage, remappedSqlState); } } catch (DatabricksSQLException | TException e) { @@ -820,7 +830,16 @@ private , F extends TFieldIdEnum> void checkResponseForErr if (!response.isSet(operationHandleField) || isErrorStatusCode(status)) { // if the operationHandle has not been set, it is an error from the server. LOGGER.error("Error thrift response {}", response); - throw new DatabricksSQLException(status.getErrorMessage(), status.getSqlState()); + String originalSqlState = status.getSqlState(); + String remappedSqlState = + classifyTransientSqlState(status.getErrorMessage(), originalSqlState); + if (!Objects.equals(remappedSqlState, originalSqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in thrift response", + originalSqlState, + remappedSqlState); + } + throw new DatabricksSQLException(status.getErrorMessage(), remappedSqlState); } } @@ -842,8 +861,15 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S statusResp.getStatus().getStatusCode(), statementId, serverError); LOGGER.error(errorMsg); String originalSqlState = statusResp.isSetSqlState() ? statusResp.getSqlState() : null; - throw new DatabricksSQLException( - errorMsg, classifyTransientSqlState(serverError, originalSqlState)); + String remappedSqlState = classifyTransientSqlState(serverError, originalSqlState); + if (!Objects.equals(remappedSqlState, originalSqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in statement [{}]", + originalSqlState, + remappedSqlState, + statementId); + } + throw new DatabricksSQLException(errorMsg, remappedSqlState); } if (statusResp.isSetOperationState() @@ -866,40 +892,16 @@ private void checkOperationStatusForErrors(TGetOperationStatusResp statusResp, S errorMsg, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR); } - throw new DatabricksSQLException(errorMsg, classifyTransientSqlState(serverError, sqlState)); - } - } - - /** - * Reclassifies the SQL state for known transient or mis-categorized server errors so callers can - * programmatically identify retryable failures. Returns {@code originalSqlState} when no known - * pattern matches. - * - *

Patterns handled: - * - *

    - *
  • Unity Catalog unavailability ({@code UC_CLIENT_EXCEPTION}, undocumented {@code XXUCC}) → - * {@code 08S01} (communication link failure, retryable). - *
  • Connection-acquisition / parquet read deadlines ({@code PARQUET_FAILED_READ_FOOTER}, - * {@code DEADLINE_EXCEEDED: acquiring connection}) with no SQL state → {@code 08S01}. - *
  • Server-side {@code ConcurrentModificationException} mis-mapped to {@code 42000} - * (syntax/access violation) → {@code 40001} (serialization failure, retryable). - *
- */ - static String classifyTransientSqlState(String errorMessage, String originalSqlState) { - if (errorMessage == null) { - return originalSqlState; - } - if (errorMessage.contains("ConcurrentModificationException")) { - return SERIALIZATION_FAILURE_SQLSTATE; - } - if (errorMessage.contains("UC_CLIENT_EXCEPTION") - || errorMessage.contains("Failed to contact the Unity Catalog server") - || errorMessage.contains("PARQUET_FAILED_READ_FOOTER") - || errorMessage.contains("DEADLINE_EXCEEDED: acquiring connection")) { - return COMMUNICATION_LINK_FAILURE_SQLSTATE; + String remappedSqlState = classifyTransientSqlState(serverError, sqlState); + if (!Objects.equals(remappedSqlState, sqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in statement [{}]", + sqlState, + remappedSqlState, + statementId); + } + throw new DatabricksSQLException(errorMsg, remappedSqlState); } - return originalSqlState; } /** diff --git a/src/test/java/com/databricks/jdbc/common/util/SqlStateClassifierTest.java b/src/test/java/com/databricks/jdbc/common/util/SqlStateClassifierTest.java new file mode 100644 index 0000000000..2475d2516c --- /dev/null +++ b/src/test/java/com/databricks/jdbc/common/util/SqlStateClassifierTest.java @@ -0,0 +1,102 @@ +package com.databricks.jdbc.common.util; + +import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.Test; + +class SqlStateClassifierTest { + + @Test + void unityCatalogBracketedTokenRemapsTo08S01() { + String ucMessage = + "Error running query: [UC_CLIENT_EXCEPTION] " + + "com.databricks.sql.managedcatalog.UnityCatalogClientException: " + + "[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + assertEquals("08S01", classifyTransientSqlState(ucMessage, "XXUCC")); + } + + @Test + void parquetReadFooterBracketedTokenRemapsTo08S01() { + String message = + "Error running query: [PARQUET_FAILED_READ_FOOTER] " + + "com.databricks.sql.io.parquet.ParquetFailedReadFooterException: " + + "DEADLINE_EXCEEDED: acquiring connection"; + assertEquals("08S01", classifyTransientSqlState(message, null)); + } + + @Test + void deadlineExceededAcquiringConnectionRemapsTo08S01() { + assertEquals( + "08S01", + classifyTransientSqlState("DEADLINE_EXCEEDED: acquiring connection from pool", null)); + } + + @Test + void concurrentModificationGatedOn42000() { + String message = + "Error running query: java.util.ConcurrentModificationException: " + + "mutation occurred during iteration"; + assertEquals("40001", classifyTransientSqlState(message, "42000")); + } + + @Test + void concurrentModificationDoesNotRemapWhenOriginalStateIsNot42000() { + String message = + "Error running query: java.util.ConcurrentModificationException: " + + "mutation occurred during iteration"; + assertEquals("42501", classifyTransientSqlState(message, "42501")); + assertEquals("XXUCC", classifyTransientSqlState(message, "XXUCC")); + assertNull(classifyTransientSqlState(message, null)); + } + + @Test + void bareConcurrentModificationExceptionWithoutFqnDoesNotRemap() { + assertEquals( + "42000", + classifyTransientSqlState( + "SELECT 'ConcurrentModificationException' FROM nonexistent_tbl", "42000")); + } + + @Test + void unbracketedUcProseDoesNotTriggerRemap() { + assertEquals( + "42S02", + classifyTransientSqlState( + "User-supplied literal: Failed to contact the Unity Catalog server", "42S02")); + } + + @Test + void unrelatedErrorPreservesOriginalState() { + assertEquals( + "42S02", classifyTransientSqlState("Table or view not found: foo.bar.baz", "42S02")); + } + + @Test + void nullMessagePreservesOriginalState() { + assertNull(classifyTransientSqlState(null, null)); + assertEquals("42S02", classifyTransientSqlState(null, "42S02")); + } + + @Test + void emptyOriginalStateIsPreservedWhenUnrelated() { + assertEquals("", classifyTransientSqlState("Some unrelated error", "")); + } + + @Test + void caseSensitivityIsExplicit() { + assertEquals( + "42S02", + classifyTransientSqlState("Lowercase [uc_client_exception] should not match", "42S02")); + } + + @Test + void ucCheckRunsBeforeCmeWhenOriginalStateIs42000AndBothSubstringsPresent() { + String message = + "[UC_CLIENT_EXCEPTION] catalog server: caused by " + + "java.util.ConcurrentModificationException: nested"; + assertEquals("08S01", classifyTransientSqlState(message, "42000")); + } +} diff --git a/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java b/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java index 7dee69b1a1..77b279cfb4 100644 --- a/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java +++ b/src/test/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessorTest.java @@ -1226,43 +1226,47 @@ void testMetadataPollingWithSleepBetweenPolls() } @Test - void classifyTransientSqlState_unityCatalogUnavailable_remapsXxuccTo08S01() { - String ucMessage = - "Error running query: [UC_CLIENT_EXCEPTION] " - + "com.databricks.sql.managedcatalog.UnityCatalogClientException: " - + "[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " - + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; - assertEquals("08S01", DatabricksThriftAccessor.classifyTransientSqlState(ucMessage, "XXUCC")); - } + void testExecute_remapsUcErrorOnStatusCodeBranchToCommunicationLinkFailure() + throws TException, SQLException, DatabricksValidationException { + setup(true); + TExecuteStatementReq request = new TExecuteStatementReq(); + TExecuteStatementResp tExecuteStatementResp = + new TExecuteStatementResp() + .setOperationHandle(tOperationHandle) + .setStatus(new TStatus().setStatusCode(TStatusCode.SUCCESS_STATUS)); - @Test - void classifyTransientSqlState_concurrentModification_remaps42000To40001() { - String message = - "Error running query: java.util.ConcurrentModificationException: " - + "mutation occurred during iteration"; - assertEquals("40001", DatabricksThriftAccessor.classifyTransientSqlState(message, "42000")); - } + String ucErrorMessage = + "Error running query: [UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + // ERROR_STATUS triggers the status-code branch in checkOperationStatusForErrors first. + TGetOperationStatusResp ucErrorResp = + new TGetOperationStatusResp() + .setStatus( + new TStatus() + .setStatusCode(TStatusCode.ERROR_STATUS) + .setErrorMessage(ucErrorMessage) + .setSqlState("XXUCC")) + .setSqlState("XXUCC") + .setOperationState(TOperationState.ERROR_STATE); - @Test - void classifyTransientSqlState_parquetReadFooterDeadline_assigns08S01WhenNullState() { - String message = - "Error running query: [PARQUET_FAILED_READ_FOOTER] " - + "com.databricks.sql.io.parquet.ParquetFailedReadFooterException: " - + "DEADLINE_EXCEEDED: acquiring connection"; - assertEquals("08S01", DatabricksThriftAccessor.classifyTransientSqlState(message, null)); - } + when(thriftClient.ExecuteStatement(request)).thenReturn(tExecuteStatementResp); + when(thriftClient.GetOperationStatus(any(TGetOperationStatusReq.class))) + .thenReturn(ucErrorResp); + Statement statement = mock(Statement.class); + when(parentStatement.getStatement()).thenReturn(statement); + when(statement.getQueryTimeout()).thenReturn(0); - @Test - void classifyTransientSqlState_unrelatedError_preservesOriginalState() { - assertEquals( - "42S02", - DatabricksThriftAccessor.classifyTransientSqlState( - "Table or view not found: foo.bar.baz", "42S02")); - assertNull(DatabricksThriftAccessor.classifyTransientSqlState(null, null)); + DatabricksSQLException e = + assertThrows( + DatabricksSQLException.class, + () -> accessor.execute(request, parentStatement, session, StatementType.SQL)); + assertEquals("08S01", e.getSQLState(), "Expected UC error to be remapped to 08S01"); + assertNotEquals("XXUCC", e.getSQLState(), "Expected XXUCC to have been remapped"); + assertTrue(e.getMessage().contains("UC_CLIENT_EXCEPTION")); } @Test - void testExecute_remapsUnityCatalogErrorToCommunicationLinkFailure() + void testExecute_remapsUcErrorOnOperationStateBranchToCommunicationLinkFailure() throws TException, SQLException, DatabricksValidationException { setup(true); TExecuteStatementReq request = new TExecuteStatementReq(); @@ -1274,11 +1278,13 @@ void testExecute_remapsUnityCatalogErrorToCommunicationLinkFailure() String ucErrorMessage = "Error running query: [UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED"; + // SUCCESS_STATUS on TStatus skips the status-code branch and falls through to the + // operation-state branch (the second classifier call site in checkOperationStatusForErrors). TGetOperationStatusResp ucErrorResp = new TGetOperationStatusResp() .setStatus( new TStatus() - .setStatusCode(TStatusCode.ERROR_STATUS) + .setStatusCode(TStatusCode.SUCCESS_STATUS) .setErrorMessage(ucErrorMessage) .setSqlState("XXUCC")) .setSqlState("XXUCC") @@ -1295,8 +1301,50 @@ void testExecute_remapsUnityCatalogErrorToCommunicationLinkFailure() assertThrows( DatabricksSQLException.class, () -> accessor.execute(request, parentStatement, session, StatementType.SQL)); - assertEquals("08S01", e.getSQLState(), "Expected UC error to be remapped to 08S01"); - assertTrue(e.getMessage().contains("UC_CLIENT_EXCEPTION")); + assertEquals( + "08S01", + e.getSQLState(), + "Expected UC error on operation-state branch to be remapped to 08S01"); + } + + @Test + void testExecute_remapsConcurrentModificationOnOperationStateBranchToSerializationFailure() + throws TException, SQLException, DatabricksValidationException { + setup(true); + TExecuteStatementReq request = new TExecuteStatementReq(); + TExecuteStatementResp tExecuteStatementResp = + new TExecuteStatementResp() + .setOperationHandle(tOperationHandle) + .setStatus(new TStatus().setStatusCode(TStatusCode.SUCCESS_STATUS)); + + String cmeErrorMessage = + "Error running query: java.util.ConcurrentModificationException: " + + "mutation occurred during iteration"; + TGetOperationStatusResp cmeErrorResp = + new TGetOperationStatusResp() + .setStatus( + new TStatus() + .setStatusCode(TStatusCode.SUCCESS_STATUS) + .setErrorMessage(cmeErrorMessage) + .setSqlState("42000")) + .setSqlState("42000") + .setOperationState(TOperationState.ERROR_STATE); + + when(thriftClient.ExecuteStatement(request)).thenReturn(tExecuteStatementResp); + when(thriftClient.GetOperationStatus(any(TGetOperationStatusReq.class))) + .thenReturn(cmeErrorResp); + Statement statement = mock(Statement.class); + when(parentStatement.getStatement()).thenReturn(statement); + when(statement.getQueryTimeout()).thenReturn(0); + + DatabricksSQLException e = + assertThrows( + DatabricksSQLException.class, + () -> accessor.execute(request, parentStatement, session, StatementType.SQL)); + assertEquals( + "40001", + e.getSQLState(), + "Expected ConcurrentModificationException with 42000 to be remapped to 40001"); } private TFetchResultsReq getFetchResultsRequest(boolean includeMetadata) From e4e34fcf7e29ac4925a50fcef3945419025ac682 Mon Sep 17 00:00:00 2001 From: samikshya-chand_data Date: Wed, 29 Apr 2026 10:14:26 +0530 Subject: [PATCH 3/3] Apply SQL state classifier to SEA failed-execution path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shared SqlStateClassifier helper introduced in the previous commit was applied at every Thrift-side error site, but the SEA path (DatabricksSdkClient.handleFailedExecution) still passed the server's SQL state through unmodified — so the same UC outage / parquet deadline / ConcurrentModificationException patterns surfaced via SEA would still be mis-categorized. Apply the classifier in SEA's handleFailedExecution and log the original → remapped state at INFO when they differ, matching the convention used at the Thrift sites. Adds a SEA-path test that asserts an XXUCC-tagged UC failure surfaces with 08S01. Co-authored-by: Isaac Signed-off-by: samikshya-chand_data --- .../impl/sqlexec/DatabricksSdkClient.java | 11 ++++++- .../impl/sqlexec/DatabricksSdkClientTest.java | 32 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java b/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java index 5981ac1955..168b1a0a54 100644 --- a/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java +++ b/src/main/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClient.java @@ -7,6 +7,7 @@ import static com.databricks.jdbc.common.EnvironmentVariables.DEFAULT_RESULT_ROW_LIMIT; import static com.databricks.jdbc.common.util.DatabricksTypeUtil.DECIMAL; import static com.databricks.jdbc.common.util.DatabricksTypeUtil.getDecimalTypeString; +import static com.databricks.jdbc.common.util.SqlStateClassifier.classifyTransientSqlState; import static com.databricks.jdbc.dbclient.impl.sqlexec.PathConstants.*; import static com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode.TEMPORARY_REDIRECT_EXCEPTION; @@ -766,8 +767,16 @@ void handleFailedExecution( throw new DatabricksTimeoutException( errorMessage, null, DatabricksDriverErrorCode.OPERATION_TIMEOUT_ERROR); } + String remappedSqlState = classifyTransientSqlState(errorMessage, sqlState); + if (!Objects.equals(remappedSqlState, sqlState)) { + LOGGER.info( + "Remapped SQL state [{}] -> [{}] for transient error pattern in SEA statement [{}]", + sqlState, + remappedSqlState, + statementId); + } throw new DatabricksSQLException( - errorMessage, sqlState, DatabricksDriverErrorCode.EXECUTE_STATEMENT_FAILED); + errorMessage, remappedSqlState, DatabricksDriverErrorCode.EXECUTE_STATEMENT_FAILED); } private ExecuteStatementResponse wrapGetStatementResponse( diff --git a/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java b/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java index 16f094ffde..66cd3afdec 100644 --- a/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java +++ b/src/test/java/com/databricks/jdbc/dbclient/impl/sqlexec/DatabricksSdkClientTest.java @@ -339,6 +339,38 @@ public void testHandleFailedExecution_FailedState_ThrowsWithoutHY008() throws Ex assertTrue(exception.getMessage().contains("execution failed")); } + @Test + public void testHandleFailedExecution_unityCatalogError_remapsToCommunicationLinkFailure() + throws Exception { + IDatabricksConnectionContext connectionContext = + DatabricksConnectionContext.parse(JDBC_URL, new Properties()); + DatabricksSdkClient databricksSdkClient = + new DatabricksSdkClient(connectionContext, statementExecutionService, apiClient); + + StatementStatus failedStatus = + new StatementStatus() + .setState(StatementState.FAILED) + .setSqlState("XXUCC") + .setError( + new ServiceError() + .setMessage( + "[UC_CLIENT_EXCEPTION] Failed to contact the Unity Catalog server. " + + "HTTP/1.1 504 Gateway Timeout, DEADLINE_EXCEEDED")); + ExecuteStatementResponse response = + new ExecuteStatementResponse() + .setStatementId(STATEMENT_ID.toSQLExecStatementId()) + .setStatus(failedStatus); + + DatabricksSQLException exception = + assertThrows( + DatabricksSQLException.class, + () -> + databricksSdkClient.handleFailedExecution( + response, STATEMENT_ID.toSQLExecStatementId(), STATEMENT)); + + assertEquals("08S01", exception.getSQLState(), "Expected XXUCC to be remapped to 08S01"); + } + @Test public void testGetStatementResult_CancelledState_ThrowsWithHY008() throws Exception { IDatabricksConnectionContext connectionContext =