Add columnar data access for memory-efficient row processing (#975)

jayantsing-db · web-flow · commit 43ac32b3cba8 · 2025-09-11T11:23:38.000Z
## Description  This PR contains changes from the PR #966 as well. Introduce ColumnarRowView to provide direct access to columnar data without materialising entire result sets into row objects. This reduces memory allocations by allowing individual cell access via `getValue(row, col)` instead of creating `List<List<Object>>` structures. Key changes: - New ColumnarRowView class with direct columnar access methods. - Updated LazyThriftResult to use columnar view instead of materialized rows. - Added utility method in DatabricksThriftUtil for creating columnar views. - Comprehensive test coverage for all column types and null handling. This optimization maintains API compatibility while significantly reducing memory overhead for large result sets. Following the changes introduced in PR #966, the following improvements were observed during a test that executes a SQL query retrieving 5 million rows: Current heap usage over time: <img width="720" height="429" alt="image" src="https://github.com/user-attachments/assets/ac87eae3-63aa-4529-9995-6392af6d4a3b" /> Improved heap usage over time: <img width="720" height="288" alt="image" src="https://github.com/user-attachments/assets/befff1f2-a54f-4609-aa81-a6311a329312" /> - The first image shows multiple significant CPU usage spikes reaching around 15%, while the second image shows consistently flat CPU usage at approximately 3%. - The erratic CPU behavior in the "before" state has been completely smoothed out. - CPU usage is now consistent and predictable rather than volatile. - Memory usage dropped from a peak of 8440 MB down to just 745 MB - that's roughly an 91% decrease. - The large memory spike and sustained high usage in the first image has been completely resolved. - Memory usage is now consistently low and stable throughout the monitoring period. - There is no negative effect on execution time; in fact, it appears to improve, likely due to more active GC pauses in the previous state. ## Testing  - Unit tests - e2e tests - FakeService tests ## Additional Notes to the Reviewer
diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md
@@ -11,7 +11,8 @@
 - **Configurable SQL validation in isValid()**: Added `EnableSQLValidationForIsValid` connection property to control whether `isValid()` method executes an actual SQL query for server-side validation. Default value is 0.
 - Implement multi-row INSERT batching optimization for prepared statements to improve performance when executing large batches of INSERT operations.
 - Implement lazy/incremental fetching for columnar results when using Databricks JDBC in Thrift mode without Arrow support. The change modifies the behavior from buffering entire result sets in memory to maintaining only a limited number of rows at a time, reducing peak heap memory usage and preventing OutOfMemory errors.
-- Added new artifact `databricks-jdbc-thin` for thin jar with runtime dependency metadata
+- Added new artifact `databricks-jdbc-thin` for thin jar with runtime dependency metadata.
+- Introduce a memory-efficient columnar data access mechanism for JDBC result processing.
 
 ### Updated
 - Databricks SDK dependency upgraded to latest version 0.60.0
diff --git a/src/main/java/com/databricks/jdbc/api/impl/ColumnarRowView.java b/src/main/java/com/databricks/jdbc/api/impl/ColumnarRowView.java
@@ -0,0 +1,159 @@
+package com.databricks.jdbc.api.impl;
+
+import com.databricks.jdbc.exception.DatabricksSQLException;
+import com.databricks.jdbc.model.client.thrift.generated.TColumn;
+import com.databricks.jdbc.model.client.thrift.generated.TRowSet;
+import com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode;
+import java.util.BitSet;
+import java.util.List;
+
+/**
+ * Memory-efficient columnar view that provides row-based access without materializing all rows.
+ * Instead of creating List<List<Object>>, this class provides direct access to columnar data on a
+ * per-row, per-column basis, significantly reducing memory allocations.
+ */
+public class ColumnarRowView {
+  private final List<TColumn> columns;
+  private final int rowCount;
+  private final ColumnAccessor[] columnAccessors;
+
+  public ColumnarRowView(TRowSet rowSet) throws DatabricksSQLException {
+    this.columns = rowSet != null ? rowSet.getColumns() : null;
+
+    if (columns == null || columns.isEmpty()) {
+      this.rowCount = 0;
+      this.columnAccessors = new ColumnAccessor[0];
+    } else {
+      this.rowCount = getRowCountFromFirstColumn();
+      this.columnAccessors = new ColumnAccessor[columns.size()];
+      for (int i = 0; i < columns.size(); i++) {
+        this.columnAccessors[i] = createColumnAccessor(columns.get(i));
+      }
+    }
+  }
+
+  /** Gets the number of rows in this view. */
+  public int getRowCount() {
+    return rowCount;
+  }
+
+  /** Gets the number of columns in this view. */
+  public int getColumnCount() {
+    return columns != null ? columns.size() : 0;
+  }
+
+  /** Gets the value at the specified row and column without materializing the entire row. */
+  public Object getValue(int rowIndex, int columnIndex) throws DatabricksSQLException {
+    if (rowIndex < 0 || rowIndex >= rowCount) {
+      throw new DatabricksSQLException(
+          "Row index out of bounds: " + rowIndex, DatabricksDriverErrorCode.INVALID_STATE);
+    }
+    if (columnIndex < 0 || columnIndex >= columnAccessors.length) {
+      throw new DatabricksSQLException(
+          "Column index out of bounds: " + columnIndex, DatabricksDriverErrorCode.INVALID_STATE);
+    }
+
+    return columnAccessors[columnIndex].getValue(rowIndex);
+  }
+
+  /**
+   * Creates a materialized row only when explicitly requested (for backward compatibility). This
+   * should be avoided in performance-critical paths.
+   */
+  public Object[] materializeRow(int rowIndex) throws DatabricksSQLException {
+    if (rowIndex < 0 || rowIndex >= rowCount) {
+      throw new DatabricksSQLException(
+          "Row index out of bounds: " + rowIndex, DatabricksDriverErrorCode.INVALID_STATE);
+    }
+
+    Object[] row = new Object[columnAccessors.length];
+    for (int col = 0; col < columnAccessors.length; col++) {
+      row[col] = columnAccessors[col].getValue(rowIndex);
+    }
+    return row;
+  }
+
+  private int getRowCountFromFirstColumn() throws DatabricksSQLException {
+    if (columns.isEmpty()) {
+      return 0;
+    }
+    TColumn firstColumn = columns.get(0);
+    return getColumnSize(firstColumn);
+  }
+
+  private static int getColumnSize(TColumn column) throws DatabricksSQLException {
+    if (column.isSetBinaryVal()) return column.getBinaryVal().getValuesSize();
+    if (column.isSetBoolVal()) return column.getBoolVal().getValuesSize();
+    if (column.isSetByteVal()) return column.getByteVal().getValuesSize();
+    if (column.isSetDoubleVal()) return column.getDoubleVal().getValuesSize();
+    if (column.isSetI16Val()) return column.getI16Val().getValuesSize();
+    if (column.isSetI32Val()) return column.getI32Val().getValuesSize();
+    if (column.isSetI64Val()) return column.getI64Val().getValuesSize();
+    if (column.isSetStringVal()) return column.getStringVal().getValuesSize();
+
+    throw new DatabricksSQLException(
+        "Unsupported column type: " + column, DatabricksDriverErrorCode.UNSUPPORTED_OPERATION);
+  }
+
+  private static ColumnAccessor createColumnAccessor(TColumn column) throws DatabricksSQLException {
+    if (column.isSetBinaryVal()) {
+      return new TypedColumnAccessor<>(
+          column.getBinaryVal().getValues(), column.getBinaryVal().getNulls());
+    }
+    if (column.isSetBoolVal()) {
+      return new TypedColumnAccessor<>(
+          column.getBoolVal().getValues(), column.getBoolVal().getNulls());
+    }
+    if (column.isSetByteVal()) {
+      return new TypedColumnAccessor<>(
+          column.getByteVal().getValues(), column.getByteVal().getNulls());
+    }
+    if (column.isSetDoubleVal()) {
+      return new TypedColumnAccessor<>(
+          column.getDoubleVal().getValues(), column.getDoubleVal().getNulls());
+    }
+    if (column.isSetI16Val()) {
+      return new TypedColumnAccessor<>(
+          column.getI16Val().getValues(), column.getI16Val().getNulls());
+    }
+    if (column.isSetI32Val()) {
+      return new TypedColumnAccessor<>(
+          column.getI32Val().getValues(), column.getI32Val().getNulls());
+    }
+    if (column.isSetI64Val()) {
+      return new TypedColumnAccessor<>(
+          column.getI64Val().getValues(), column.getI64Val().getNulls());
+    }
+    if (column.isSetStringVal()) {
+      return new TypedColumnAccessor<>(
+          column.getStringVal().getValues(), column.getStringVal().getNulls());
+    }
+
+    throw new DatabricksSQLException(
+        "Unsupported column type: " + column, DatabricksDriverErrorCode.UNSUPPORTED_OPERATION);
+  }
+
+  /** Interface for accessing column values by index without materializing the entire column. */
+  private interface ColumnAccessor {
+    Object getValue(int rowIndex);
+  }
+
+  /** Memory-efficient column accessor that handles nulls and provides direct index-based access. */
+  private static class TypedColumnAccessor<T> implements ColumnAccessor {
+    private final List<T> values;
+    private final BitSet nullBits;
+
+    public TypedColumnAccessor(List<T> values, byte[] nulls) {
+      this.values = values;
+      this.nullBits = nulls != null ? BitSet.valueOf(nulls) : null;
+    }
+
+    @Override
+    public Object getValue(int rowIndex) {
+      if (nullBits != null && nullBits.get(rowIndex)) {
+        return null;
+      }
+      return values.get(rowIndex);
+    }
+  }
+}
diff --git a/src/main/java/com/databricks/jdbc/api/impl/LazyThriftResult.java b/src/main/java/com/databricks/jdbc/api/impl/LazyThriftResult.java
@@ -1,7 +1,7 @@
 package com.databricks.jdbc.api.impl;
 
 import static com.databricks.jdbc.common.EnvironmentVariables.DEFAULT_RESULT_ROW_LIMIT;
-import static com.databricks.jdbc.common.util.DatabricksThriftUtil.extractRowsFromColumnar;
+import static com.databricks.jdbc.common.util.DatabricksThriftUtil.createColumnarView;
 
 import com.databricks.jdbc.api.internal.IDatabricksSession;
 import com.databricks.jdbc.api.internal.IDatabricksStatementInternal;
@@ -10,13 +10,12 @@
 import com.databricks.jdbc.log.JdbcLoggerFactory;
 import com.databricks.jdbc.model.client.thrift.generated.TFetchResultsResp;
 import com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode;
-import java.util.List;
 
 public class LazyThriftResult implements IExecutionResult {
   private static final JdbcLogger LOGGER = JdbcLoggerFactory.getLogger(LazyThriftResult.class);
 
   private TFetchResultsResp currentResponse;
-  private List<List<Object>> currentBatch;
+  private ColumnarRowView currentBatch;
   private int currentBatchIndex;
   private long globalRowIndex;
   private final IDatabricksSession session;
@@ -53,7 +52,7 @@ public LazyThriftResult(
     loadCurrentBatch();
     LOGGER.debug(
         "LazyThriftResult initialized with {} rows in first batch, hasMoreRows: {}",
-        currentBatch.size(),
+        currentBatch.getRowCount(),
         currentResponse.hasMoreRows);
   }
 
@@ -75,16 +74,15 @@ public Object getObject(int columnIndex) throws DatabricksSQLException {
       throw new DatabricksSQLException(
           "Cursor is before first row", DatabricksDriverErrorCode.INVALID_STATE);
     }
-    if (currentBatchIndex < 0 || currentBatchIndex >= currentBatch.size()) {
+    if (currentBatchIndex < 0 || currentBatchIndex >= currentBatch.getRowCount()) {
       throw new DatabricksSQLException(
           "Invalid cursor position", DatabricksDriverErrorCode.INVALID_STATE);
     }
-    List<Object> currentRowData = currentBatch.get(currentBatchIndex);
-    if (columnIndex < 0 || columnIndex >= currentRowData.size()) {
+    if (columnIndex < 0 || columnIndex >= currentBatch.getColumnCount()) {
       throw new DatabricksSQLException(
           "Column index out of bounds " + columnIndex, DatabricksDriverErrorCode.INVALID_STATE);
     }
-    return currentRowData.get(columnIndex);
+    return currentBatch.getValue(currentBatchIndex, columnIndex);
   }
 
   /**
@@ -128,13 +126,13 @@ public boolean next() throws DatabricksSQLException {
     globalRowIndex++;
 
     // Check if we need to fetch the next batch
-    if (currentBatchIndex >= currentBatch.size()) {
+    if (currentBatchIndex >= currentBatch.getRowCount()) {
       // Keep fetching until we get a non-empty batch or no more rows
       while (currentResponse.hasMoreRows) {
         fetchNextBatch();
 
         // If we got a non-empty batch, we can proceed
-        if (!currentBatch.isEmpty()) {
+        if (currentBatch.getRowCount() > 0) {
           currentBatchIndex = 0; // Reset to first row of new batch
           break;
         }
@@ -143,7 +141,7 @@ public boolean next() throws DatabricksSQLException {
       }
 
       // If we exited the loop and still have an empty batch, we've reached the end
-      if (currentBatch.isEmpty()) {
+      if (currentBatch.getRowCount() == 0) {
         hasReachedEnd = true;
         globalRowIndex--; // Revert the increment since we didn't actually move to a new row
         return false;
@@ -171,7 +169,7 @@ public boolean hasNext() {
     }
 
     // Check if there are more rows in current batch
-    if (currentBatchIndex + 1 < currentBatch.size()) {
+    if (currentBatchIndex + 1 < currentBatch.getRowCount()) {
       return true;
     }
 
@@ -196,7 +194,7 @@ public void close() {
   @Override
   public long getRowCount() {
     // Return the number of rows in the current batch
-    return currentBatch != null ? currentBatch.size() : 0;
+    return currentBatch != null ? currentBatch.getRowCount() : 0;
   }
 
   /**
@@ -210,21 +208,18 @@ public long getChunkCount() {
     return 0;
   }
 
-  /**
-   * Loads the current response data into memory as a batch of rows.
-   *
-   * @throws DatabricksSQLException if the response data cannot be processed
-   */
   private void loadCurrentBatch() throws DatabricksSQLException {
-    currentBatch = extractRowsFromColumnar(currentResponse.getResults());
+    currentBatch = createColumnarView(currentResponse.getResults());
     currentBatchIndex = -1; // Reset batch index
-    totalRowsFetched += currentBatch.size();
+    totalRowsFetched += currentBatch.getRowCount();
     LOGGER.debug(
-        "Loaded batch with {} rows, total fetched: {}", currentBatch.size(), totalRowsFetched);
+        "Loaded batch with {} rows, total fetched: {}",
+        currentBatch.getRowCount(),
+        totalRowsFetched);
   }
 
   /**
-   * Fetches the next batch of data from the server and loads it into memory.
+   * Fetches the next batch of data from the server and creates columnar views.
    *
    * @throws DatabricksSQLException if the fetch operation fails
    */
@@ -236,7 +231,7 @@ private void fetchNextBatch() throws DatabricksSQLException {
 
       LOGGER.debug(
           "Fetched batch with {} rows, hasMoreRows: {}",
-          currentBatch.size(),
+          currentBatch.getRowCount(),
           currentResponse.hasMoreRows);
     } catch (DatabricksSQLException e) {
       LOGGER.error("Failed to fetch next batch: {}", e.getMessage());
diff --git a/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java b/src/main/java/com/databricks/jdbc/common/util/DatabricksThriftUtil.java
@@ -4,6 +4,7 @@
 import static com.databricks.jdbc.common.util.DatabricksTypeUtil.*;
 import static com.databricks.jdbc.model.client.thrift.generated.TTypeId.*;
 
+import com.databricks.jdbc.api.impl.ColumnarRowView;
 import com.databricks.jdbc.api.internal.IDatabricksSession;
 import com.databricks.jdbc.api.internal.IDatabricksStatementInternal;
 import com.databricks.jdbc.common.DatabricksJdbcConstants;
@@ -125,6 +126,17 @@ public static List<List<Object>> extractRowsFromColumnar(TRowSet rowSet)
     return rows;
   }
 
+  /**
+   * Memory-efficient alternative that creates a columnar view instead of materializing all rows.
+   * Use this method to significantly reduce memory allocations when accessing row data.
+   *
+   * @param rowSet that contains columnar data
+   * @return a columnar view that provides row-based access without full materialization
+   */
+  public static ColumnarRowView createColumnarView(TRowSet rowSet) throws DatabricksSQLException {
+    return new ColumnarRowView(rowSet);
+  }
+
   /** Returns statement status for given operation status response */
   public static StatementStatus getStatementStatus(TGetOperationStatusResp resp) {
     StatementState state = null;
diff --git a/src/test/java/com/databricks/jdbc/api/impl/ColumnarRowViewTest.java b/src/test/java/com/databricks/jdbc/api/impl/ColumnarRowViewTest.java