Fetch columnar results incrementally/lazily (#966)

jayantsing-db · web-flow · commit 970c4c8b41fa · 2025-09-05T17:02:05.000+05:30
## Description When JDBC is used with Arrow disabled and operating in Thrift mode (the default mode or when useThriftClient=1), the Hive ThriftServer returns results in a columnar binary format (non-Arrow). Currently, before constructing the result set, JDBC fetches all the required data at once and buffers the entire result set in memory. This PR modifies this behavior for the Arrow-disabled Thrift mode case, so that the result set maintains only a limited number of rows in memory at a time, controlled by the connection parameter RowsFetchedPerBlock. As the client reads rows, the result set will issue additional fetch requests incrementally. Implications: The JVM no longer buffers the entire result set upfront, which previously caused a sharp spike in heap memory usage before garbage collection could reclaim space. This spike often led to OutOfMemory (OOM) errors. With this change, peak heap memory usage is reduced and heap usage increases more gradually, which allows the JVM’s memory management to work more effectively. The following improvements were observed when executing a SQL query on 1 million rows and iterating through the result set without printing the rows. Existing behaviour: a sudden spike with a higher peak: <img width="719" height="272" alt="image" src="https://github.com/user-attachments/assets/1e9cf4c6-4847-4b39-a160-74175ca3f871" /> New behaviour: a gradual heap increase within memory management bounds, with a lower peak: <img width="720" height="284" alt="image" src="https://github.com/user-attachments/assets/cb42c585-5af5-492b-8a12-1b34a50a4274" /> ## Testing  - e2e tests - unit tests - fake service tests ## Additional Notes to the Reviewer  Note that this does not lower the max heap usage by a lot (instead, it makes the memory increase gradual). I will make further improvements on how we process the data.
diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md
@@ -10,6 +10,7 @@
 - Support for fetching schemas across all catalogs (when catalog is specified as null or a wildcard) in `DatabaseMetaData#getSchemas` API in SQL Execution mode.
 - **Configurable SQL validation in isValid()**: Added `EnableSQLValidationForIsValid` connection property to control whether `isValid()` method executes an actual SQL query for server-side validation. Default value is 0.
 - Implement multi-row INSERT batching optimization for prepared statements to improve performance when executing large batches of INSERT operations.
+- Implement lazy/incremental fetching for columnar results when using Databricks JDBC in Thrift mode without Arrow support. The change modifies the behavior from buffering entire result sets in memory to maintaining only a limited number of rows at a time, reducing peak heap memory usage and preventing OutOfMemory errors.
 
 ### Updated
 - Databricks SDK dependency upgraded to latest version 0.60.0
diff --git a/src/main/java/com/databricks/jdbc/api/impl/DatabricksResultSet.java b/src/main/java/com/databricks/jdbc/api/impl/DatabricksResultSet.java
@@ -583,6 +583,16 @@ public boolean isBeforeFirst() throws SQLException {
     return executionResult.getCurrentRow() == -1;
   }
 
+  /**
+   * {@inheritDoc}
+   *
+   * <p><b>Limitation:</b> For lazy-loaded result sets ({@link LazyThriftResult}), particularly
+   * those using {@link
+   * com.databricks.jdbc.model.client.thrift.generated.TSparkRowSetType#COLUMN_BASED_SET}, this
+   * method cannot reliably determine the cursor position. The total row count remains unknown until
+   * all rows are fetched, preventing accurate detection of whether the cursor is after the last
+   * row. This is specific to Databricks JDBC dialect.
+   */
   @Override
   public boolean isAfterLast() throws SQLException {
     checkIfClosed();
@@ -595,9 +605,27 @@ public boolean isFirst() throws SQLException {
     return executionResult.getCurrentRow() == 0;
   }
 
+  /**
+   * {@inheritDoc}
+   *
+   * <p>This method uses different strategies based on the result set type:
+   *
+   * <ul>
+   *   <li>For {@link LazyThriftResult} instances: Checks if there are no more rows available (using
+   *       {@code hasNext()}), since the total row count is unknown until all rows are fetched.
+   *   <li>For other result types: Compares the current row position against the known total row
+   *       count.
+   * </ul>
+   *
+   * <p><b>Limitation:</b> for {@link LazyThriftResult} the answer is best-effort. Once the
+   * in-memory batch is exhausted, its {@code hasNext()} falls back to the server's {@code
+   * hasMoreRows} flag without fetching, so this method can return {@code false} on the real last
+   * row when the server still reports more rows but the remaining batches turn out to be empty.
+   *
+   * @return {@code true} if the cursor is on the last row, {@code false} otherwise
+   * @throws SQLException if the result set is closed or an error occurs
+   */
   @Override
   public boolean isLast() throws SQLException {
     checkIfClosed();
+    if (executionResult instanceof LazyThriftResult) {
+      // A cursor still before the first row (-1) is never "last", even if no rows will follow.
+      return executionResult.getCurrentRow() >= 0 && !executionResult.hasNext();
+    }
     return executionResult.getCurrentRow() == resultSetMetaData.getTotalRows() - 1;
   }
 
diff --git a/src/main/java/com/databricks/jdbc/api/impl/ExecutionResultFactory.java b/src/main/java/com/databricks/jdbc/api/impl/ExecutionResultFactory.java
@@ -1,7 +1,5 @@
 package com.databricks.jdbc.api.impl;
 
-import static com.databricks.jdbc.common.util.DatabricksThriftUtil.convertColumnarToRowBased;
-
 import com.databricks.jdbc.api.impl.arrow.ArrowStreamResult;
 import com.databricks.jdbc.api.impl.volume.VolumeOperationResult;
 import com.databricks.jdbc.api.internal.IDatabricksSession;
@@ -96,7 +94,7 @@ private static IExecutionResult getResultHandler(
     LOGGER.info("Processing result of format {} from Thrift server", resultFormat);
     switch (resultFormat) {
       case COLUMN_BASED_SET:
-        return getResultSet(convertColumnarToRowBased(resultsResp, parentStatement, session));
+        return new LazyThriftResult(resultsResp, parentStatement, session);
       case ARROW_BASED_SET:
         return new ArrowStreamResult(resultsResp, true, parentStatement, session);
       case URL_BASED_SET:
diff --git a/src/main/java/com/databricks/jdbc/api/impl/LazyThriftResult.java b/src/main/java/com/databricks/jdbc/api/impl/LazyThriftResult.java
@@ -0,0 +1,266 @@
+package com.databricks.jdbc.api.impl;
+
+import static com.databricks.jdbc.common.EnvironmentVariables.DEFAULT_RESULT_ROW_LIMIT;
+import static com.databricks.jdbc.common.util.DatabricksThriftUtil.extractRowsFromColumnar;
+
+import com.databricks.jdbc.api.internal.IDatabricksSession;
+import com.databricks.jdbc.api.internal.IDatabricksStatementInternal;
+import com.databricks.jdbc.exception.DatabricksSQLException;
+import com.databricks.jdbc.log.JdbcLogger;
+import com.databricks.jdbc.log.JdbcLoggerFactory;
+import com.databricks.jdbc.model.client.thrift.generated.TFetchResultsResp;
+import com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode;
+import java.util.List;
+
+/**
+ * {@link IExecutionResult} for Thrift column-based results that holds a single fetched batch in
+ * memory and asks the server for the next batch only when the cursor runs past the current one.
+ *
+ * <p>The cursor is forward-only: {@link #next()} is the only operation that moves it. The class
+ * performs no synchronization of its own.
+ */
+public class LazyThriftResult implements IExecutionResult {
+  private static final JdbcLogger LOGGER = JdbcLoggerFactory.getLogger(LazyThriftResult.class);
+
+  // Most recent fetch response; set to null by close().
+  private TFetchResultsResp currentResponse;
+  // Rows extracted from currentResponse; set to null by close().
+  private List<List<Object>> currentBatch;
+  // Position inside currentBatch, -1 when no row of the batch has been visited yet.
+  private int currentBatchIndex;
+  // Zero-based position across all batches, -1 while before the first row.
+  private long globalRowIndex;
+  private final IDatabricksSession session;
+  private final IDatabricksStatementInternal statement;
+  // Row cap taken from the statement; values <= 0 mean "no cap".
+  private final int maxRows;
+  private boolean hasReachedEnd;
+  private boolean isClosed;
+  private long totalRowsFetched;
+
+  /**
+   * Creates a new LazyThriftResult that lazily fetches data on demand.
+   *
+   * @param initialResponse the initial response from the server
+   * @param statement the statement that generated this result; when {@code null} the row limit
+   *     falls back to {@code DEFAULT_RESULT_ROW_LIMIT}
+   * @param session the session to use for fetching additional data
+   * @throws DatabricksSQLException if the initial response cannot be processed
+   */
+  public LazyThriftResult(
+      TFetchResultsResp initialResponse,
+      IDatabricksStatementInternal statement,
+      IDatabricksSession session)
+      throws DatabricksSQLException {
+    this.currentResponse = initialResponse;
+    this.statement = statement;
+    this.session = session;
+    this.maxRows = statement != null ? statement.getMaxRows() : DEFAULT_RESULT_ROW_LIMIT;
+    this.globalRowIndex = -1;
+    this.currentBatchIndex = -1;
+    this.hasReachedEnd = false;
+    this.isClosed = false;
+    this.totalRowsFetched = 0;
+
+    // Load initial batch
+    loadCurrentBatch();
+    LOGGER.debug(
+        "LazyThriftResult initialized with {} rows in first batch, hasMoreRows: {}",
+        currentBatch.size(),
+        currentResponse.hasMoreRows);
+  }
+
+  /**
+   * Gets the value at the specified column index for the current row.
+   *
+   * @param columnIndex the zero-based column index
+   * @return the value at the specified column
+   * @throws DatabricksSQLException if the result is closed, cursor is invalid, or column index is
+   *     out of bounds
+   */
+  @Override
+  public Object getObject(int columnIndex) throws DatabricksSQLException {
+    if (isClosed) {
+      throw new DatabricksSQLException(
+          "Result is already closed", DatabricksDriverErrorCode.STATEMENT_CLOSED);
+    }
+    if (globalRowIndex == -1) {
+      throw new DatabricksSQLException(
+          "Cursor is before first row", DatabricksDriverErrorCode.INVALID_STATE);
+    }
+    if (currentBatchIndex < 0 || currentBatchIndex >= currentBatch.size()) {
+      throw new DatabricksSQLException(
+          "Invalid cursor position", DatabricksDriverErrorCode.INVALID_STATE);
+    }
+    List<Object> currentRowData = currentBatch.get(currentBatchIndex);
+    if (columnIndex < 0 || columnIndex >= currentRowData.size()) {
+      throw new DatabricksSQLException(
+          "Column index out of bounds " + columnIndex, DatabricksDriverErrorCode.INVALID_STATE);
+    }
+    return currentRowData.get(columnIndex);
+  }
+
+  /**
+   * Gets the current row index (0-based). Returns -1 if before the first row.
+   *
+   * @return the current row index
+   */
+  @Override
+  public long getCurrentRow() {
+    return globalRowIndex;
+  }
+
+  /**
+   * Moves the cursor to the next row. Fetches additional data from server if needed.
+   *
+   * <p>The cursor indices are advanced only once a row is known to be available. If a fetch fails,
+   * the exception propagates and {@link #getCurrentRow()} still reports the last row that was
+   * actually delivered.
+   *
+   * @return true if there is a next row, false if at the end, closed, or the row limit is reached
+   * @throws DatabricksSQLException if an error occurs while fetching data
+   */
+  @Override
+  public boolean next() throws DatabricksSQLException {
+    if (isClosed || hasReachedEnd) {
+      return false;
+    }
+
+    // Stop at the maxRows limit and remember it, so isCompletelyFetched() reflects the limit.
+    if (isRowLimitReached()) {
+      hasReachedEnd = true;
+      return false;
+    }
+
+    // Fast path: the next row is already in memory.
+    if (currentBatchIndex + 1 < currentBatch.size()) {
+      currentBatchIndex++;
+      globalRowIndex++;
+      return true;
+    }
+
+    // Current batch is exhausted: keep fetching until a non-empty batch arrives or the server
+    // reports that nothing is left. fetchNextBatch() resets currentBatchIndex to -1.
+    while (currentResponse.hasMoreRows) {
+      fetchNextBatch();
+      if (!currentBatch.isEmpty()) {
+        currentBatchIndex = 0;
+        globalRowIndex++;
+        return true;
+      }
+    }
+
+    // No further rows exist; the cursor indices are left untouched.
+    hasReachedEnd = true;
+    return false;
+  }
+
+  /**
+   * Checks if there are more rows available without advancing the cursor.
+   *
+   * <p>No fetch is issued here: once the in-memory batch is exhausted the answer is the server's
+   * {@code hasMoreRows} flag, so {@code true} may still be followed by {@link #next()} returning
+   * {@code false} if the remaining batches are empty.
+   *
+   * @return true if there are more rows, false otherwise
+   */
+  @Override
+  public boolean hasNext() {
+    if (isClosed || hasReachedEnd || isRowLimitReached()) {
+      return false;
+    }
+
+    // Check if there are more rows in current batch
+    if (currentBatchIndex + 1 < currentBatch.size()) {
+      return true;
+    }
+
+    // Check if there are more batches to fetch
+    return currentResponse.hasMoreRows;
+  }
+
+  /** Closes this result and releases the references to the buffered batch and response. */
+  @Override
+  public void close() {
+    this.isClosed = true;
+    this.currentBatch = null;
+    this.currentResponse = null;
+    LOGGER.debug("LazyThriftResult closed after fetching {} total rows", totalRowsFetched);
+  }
+
+  /**
+   * Gets the number of rows in the current batch.
+   *
+   * @return the number of rows in the current batch, or 0 after {@link #close()}
+   */
+  @Override
+  public long getRowCount() {
+    // Return the number of rows in the current batch
+    return currentBatch != null ? currentBatch.size() : 0;
+  }
+
+  /**
+   * Gets the chunk count. Always returns 0 for thrift columnar results.
+   *
+   * @return 0 (thrift results don't use chunks like Arrow)
+   */
+  @Override
+  public long getChunkCount() {
+    // For thrift columnar results, we don't have chunks in the same sense as Arrow
+    return 0;
+  }
+
+  /**
+   * Tells whether moving to the next row would exceed the statement's row limit.
+   *
+   * @return true when a positive maxRows is configured and the next row index would reach it
+   */
+  private boolean isRowLimitReached() {
+    return maxRows > 0 && globalRowIndex + 1 >= maxRows;
+  }
+
+  /**
+   * Loads the current response data into memory as a batch of rows.
+   *
+   * @throws DatabricksSQLException if the response data cannot be processed
+   */
+  private void loadCurrentBatch() throws DatabricksSQLException {
+    currentBatch = extractRowsFromColumnar(currentResponse.getResults());
+    currentBatchIndex = -1; // Reset batch index
+    totalRowsFetched += currentBatch.size();
+    LOGGER.debug(
+        "Loaded batch with {} rows, total fetched: {}", currentBatch.size(), totalRowsFetched);
+  }
+
+  /**
+   * Fetches the next batch of data from the server and loads it into memory.
+   *
+   * <p>On failure the result is marked as ended, so later {@link #next()} calls return {@code
+   * false} instead of retrying the fetch.
+   *
+   * @throws DatabricksSQLException if the fetch operation fails
+   */
+  private void fetchNextBatch() throws DatabricksSQLException {
+    try {
+      LOGGER.debug("Fetching next batch, current total rows fetched: {}", totalRowsFetched);
+      currentResponse = session.getDatabricksClient().getMoreResults(statement);
+      loadCurrentBatch();
+
+      LOGGER.debug(
+          "Fetched batch with {} rows, hasMoreRows: {}",
+          currentBatch.size(),
+          currentResponse.hasMoreRows);
+    } catch (DatabricksSQLException e) {
+      LOGGER.error("Failed to fetch next batch: {}", e.getMessage());
+      hasReachedEnd = true;
+      throw e; // Propagate exception to fail fast
+    }
+  }
+
+  /**
+   * Gets the total number of rows fetched from the server so far. This is different from
+   * getRowCount() which returns current batch size.
+   *
+   * @return the total number of rows fetched from the server
+   */
+  public long getTotalRowsFetched() {
+    return totalRowsFetched;
+  }
+
+  /**
+   * Checks if all data has been fetched from the server.
+   *
+   * <p>A closed result reports {@code true}: {@link #close()} drops the last response, and no
+   * further fetch can be issued afterwards.
+   *
+   * @return true if no further fetch will happen (end reached, maxRows limit hit, result closed,
+   *     or the server reported no more rows)
+   */
+  public boolean isCompletelyFetched() {
+    // currentResponse is null after close(); guard against dereferencing it.
+    return hasReachedEnd || currentResponse == null || !currentResponse.hasMoreRows;
+  }
+}
diff --git a/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java b/src/main/java/com/databricks/jdbc/dbclient/impl/thrift/DatabricksThriftAccessor.java
@@ -253,7 +253,7 @@ DatabricksResultSet execute(
             getResultSetResp(
                 response.getStatus(),
                 response.getOperationHandle(),
-                response.toString(),
+                "executeStatement",
                 maxRowsPerBlock,
                 true);
         long fetchEndTime = System.nanoTime();
@@ -294,7 +294,9 @@ private TGetOperationStatusResp pollTillOperationFinished(
     TGetOperationStatusResp statusResp = null;
     if (response.isSetDirectResults()) {
       checkDirectResultsForErrorStatus(
-          response.getDirectResults(), response.toString(), statementId.toSQLExecStatementId());
+          response.getDirectResults(),
+          "executeStatement DirectResults",
+          statementId.toSQLExecStatementId());
       statusResp = response.getDirectResults().getOperationStatus();
       checkOperationStatusForErrors(
           statusResp, StatementId.loggableStatementId(response.getOperationHandle()));
@@ -409,7 +411,7 @@ DatabricksResultSet getStatementResult(
       if (operationState == TOperationState.FINISHED_STATE) {
         long fetchStartTime = System.nanoTime();
         resultSet =
-            getResultSetResp(response.getStatus(), operationHandle, response.toString(), -1, true);
+            getResultSetResp(response.getStatus(), operationHandle, "getStatementResult", -1, true);
         long fetchEndTime = System.nanoTime();
         long fetchLatencyNanos = fetchEndTime - fetchStartTime;
         long fetchLatencyMillis = fetchLatencyNanos / 1_000_000;
@@ -523,16 +525,16 @@ TFetchResultsResp getResultSetResp(
     } catch (TException e) {
       String errorMessage =
           String.format(
-              "Error while fetching results from Thrift server. Request {%s}, Error {%s}",
-              request.toString(), e.getMessage());
+              "Error while fetching results from Thrift server. Request maxRows=%d, maxBytes=%d, Error {%s}",
+              request.getMaxRows(), request.getMaxBytes(), e.getMessage());
       LOGGER.error(e, errorMessage);
       throw new DatabricksHttpException(errorMessage, e, DatabricksDriverErrorCode.INVALID_STATE);
     }
     verifySuccessStatus(
         response.getStatus(),
         String.format(
-            "Error while fetching results Request {%s}. TFetchResultsResp {%s}. ",
-            request, response),
+            "Error while fetching results Request maxRows=%d, maxBytes=%d. Response hasMoreRows=%s",
+            request.getMaxRows(), request.getMaxBytes(), response.hasMoreRows),
         statementId);
     return response;
   }
diff --git a/src/test/java/com/databricks/jdbc/api/impl/ExecutionResultFactoryTest.java b/src/test/java/com/databricks/jdbc/api/impl/ExecutionResultFactoryTest.java
@@ -101,7 +101,7 @@ public void testGetResultSet_thriftColumnar() throws SQLException {
     when(fetchResultsResp.getResultSetMetadata()).thenReturn(resultSetMetadataResp);
     IExecutionResult result =
         ExecutionResultFactory.getResultSet(fetchResultsResp, session, parentStatement);
-    assertInstanceOf(InlineJsonResult.class, result);
+    assertInstanceOf(LazyThriftResult.class, result);
   }
 
   @Test
diff --git a/src/test/java/com/databricks/jdbc/api/impl/LazyThriftResultTest.java b/src/test/java/com/databricks/jdbc/api/impl/LazyThriftResultTest.java

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ public void testGetResultSet_thriftColumnar() throws SQLException {`
`101`	`101`	`when(fetchResultsResp.getResultSetMetadata()).thenReturn(resultSetMetadataResp);`
`102`	`102`	`IExecutionResult result =`
`103`	`103`	`ExecutionResultFactory.getResultSet(fetchResultsResp, session, parentStatement);`
`104`		`- assertInstanceOf(InlineJsonResult.class, result);`
	`104`	`+ assertInstanceOf(LazyThriftResult.class, result);`
`105`	`105`	`}`
`106`	`106`
`107`	`107`	`@Test`