Skip to content
3 changes: 2 additions & 1 deletion NEXT_CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
- **Configurable SQL validation in isValid()**: Added `EnableSQLValidationForIsValid` connection property to control whether `isValid()` method executes an actual SQL query for server-side validation. Default value is 0.
- Implement multi-row INSERT batching optimization for prepared statements to improve performance when executing large batches of INSERT operations.
- Implement lazy/incremental fetching for columnar results when using Databricks JDBC in Thrift mode without Arrow support. The change modifies the behavior from buffering entire result sets in memory to maintaining only a limited number of rows at a time, reducing peak heap memory usage and preventing OutOfMemory errors.
- Added new artifact `databricks-jdbc-thin` for thin jar with runtime dependency metadata
- Added new artifact `databricks-jdbc-thin` for thin jar with runtime dependency metadata.
- Introduce a memory-efficient columnar data access mechanism for JDBC result processing.

### Updated
- Databricks SDK dependency upgraded to latest version 0.60.0
Expand Down
159 changes: 159 additions & 0 deletions src/main/java/com/databricks/jdbc/api/impl/ColumnarRowView.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package com.databricks.jdbc.api.impl;

import com.databricks.jdbc.exception.DatabricksSQLException;
import com.databricks.jdbc.model.client.thrift.generated.TColumn;
import com.databricks.jdbc.model.client.thrift.generated.TRowSet;
import com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode;
import java.util.BitSet;
import java.util.List;

/**
* Memory-efficient columnar view that provides row-based access without materializing all rows.
* Instead of creating List<List<Object>>, this class provides direct access to columnar data on a
* per-row, per-column basis, significantly reducing memory allocations.
*/
public class ColumnarRowView {
private final List<TColumn> columns;
private final int rowCount;
private final ColumnAccessor[] columnAccessors;

public ColumnarRowView(TRowSet rowSet) throws DatabricksSQLException {
this.columns = rowSet != null ? rowSet.getColumns() : null;

if (columns == null || columns.isEmpty()) {
this.rowCount = 0;
this.columnAccessors = new ColumnAccessor[0];
} else {
this.rowCount = getRowCountFromFirstColumn();
this.columnAccessors = new ColumnAccessor[columns.size()];
for (int i = 0; i < columns.size(); i++) {
this.columnAccessors[i] = createColumnAccessor(columns.get(i));
}
}
}

/** Gets the number of rows in this view. */
public int getRowCount() {
return rowCount;
}

/** Gets the number of columns in this view. */
public int getColumnCount() {
return columns != null ? columns.size() : 0;
}

/** Gets the value at the specified row and column without materializing the entire row. */
public Object getValue(int rowIndex, int columnIndex) throws DatabricksSQLException {
if (rowIndex < 0 || rowIndex >= rowCount) {
throw new DatabricksSQLException(
"Row index out of bounds: " + rowIndex, DatabricksDriverErrorCode.INVALID_STATE);
}
if (columnIndex < 0 || columnIndex >= columnAccessors.length) {
throw new DatabricksSQLException(
"Column index out of bounds: " + columnIndex, DatabricksDriverErrorCode.INVALID_STATE);
}

return columnAccessors[columnIndex].getValue(rowIndex);
}

/**
* Creates a materialized row only when explicitly requested (for backward compatibility). This
* should be avoided in performance-critical paths.
*/
public Object[] materializeRow(int rowIndex) throws DatabricksSQLException {
if (rowIndex < 0 || rowIndex >= rowCount) {
throw new DatabricksSQLException(
"Row index out of bounds: " + rowIndex, DatabricksDriverErrorCode.INVALID_STATE);
}

Object[] row = new Object[columnAccessors.length];
for (int col = 0; col < columnAccessors.length; col++) {
row[col] = columnAccessors[col].getValue(rowIndex);
}
return row;
}

private int getRowCountFromFirstColumn() throws DatabricksSQLException {
if (columns.isEmpty()) {
return 0;
}
TColumn firstColumn = columns.get(0);
return getColumnSize(firstColumn);
}

private static int getColumnSize(TColumn column) throws DatabricksSQLException {
if (column.isSetBinaryVal()) return column.getBinaryVal().getValuesSize();
if (column.isSetBoolVal()) return column.getBoolVal().getValuesSize();
if (column.isSetByteVal()) return column.getByteVal().getValuesSize();
if (column.isSetDoubleVal()) return column.getDoubleVal().getValuesSize();
if (column.isSetI16Val()) return column.getI16Val().getValuesSize();
if (column.isSetI32Val()) return column.getI32Val().getValuesSize();
if (column.isSetI64Val()) return column.getI64Val().getValuesSize();
if (column.isSetStringVal()) return column.getStringVal().getValuesSize();

throw new DatabricksSQLException(
"Unsupported column type: " + column, DatabricksDriverErrorCode.UNSUPPORTED_OPERATION);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what about complex datatypes? Will they also be covered in above primitive types?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only support these

private static List<?> getColumnValues(TColumn column) throws DatabricksSQLException {
columns. There is nothing new added or removed in these changes. If complex types come as binary (which i think is the case), complex types are supported. Otherwise, not and this is the current behaviour too.

}

private static ColumnAccessor createColumnAccessor(TColumn column) throws DatabricksSQLException {
if (column.isSetBinaryVal()) {
return new TypedColumnAccessor<>(
column.getBinaryVal().getValues(), column.getBinaryVal().getNulls());
}
if (column.isSetBoolVal()) {
return new TypedColumnAccessor<>(
column.getBoolVal().getValues(), column.getBoolVal().getNulls());
}
if (column.isSetByteVal()) {
return new TypedColumnAccessor<>(
column.getByteVal().getValues(), column.getByteVal().getNulls());
}
if (column.isSetDoubleVal()) {
return new TypedColumnAccessor<>(
column.getDoubleVal().getValues(), column.getDoubleVal().getNulls());
}
if (column.isSetI16Val()) {
return new TypedColumnAccessor<>(
column.getI16Val().getValues(), column.getI16Val().getNulls());
}
if (column.isSetI32Val()) {
return new TypedColumnAccessor<>(
column.getI32Val().getValues(), column.getI32Val().getNulls());
}
if (column.isSetI64Val()) {
return new TypedColumnAccessor<>(
column.getI64Val().getValues(), column.getI64Val().getNulls());
}
if (column.isSetStringVal()) {
return new TypedColumnAccessor<>(
column.getStringVal().getValues(), column.getStringVal().getNulls());
}

throw new DatabricksSQLException(
"Unsupported column type: " + column, DatabricksDriverErrorCode.UNSUPPORTED_OPERATION);
}

/** Interface for accessing column values by index without materializing the entire column. */
private interface ColumnAccessor {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use separate files for interface and impl

Object getValue(int rowIndex);
}

/** Memory-efficient column accessor that handles nulls and provides direct index-based access. */
private static class TypedColumnAccessor<T> implements ColumnAccessor {
private final List<T> values;
private final BitSet nullBits;

public TypedColumnAccessor(List<T> values, byte[] nulls) {
this.values = values;
this.nullBits = nulls != null ? BitSet.valueOf(nulls) : null;
}

@Override
public Object getValue(int rowIndex) {
if (nullBits != null && nullBits.get(rowIndex)) {
return null;
Comment on lines +144 to +154
Copy link

Copilot AI Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Creating a new BitSet for every column accessor could be memory-intensive for large datasets. Consider lazy initialization or caching the BitSet creation to improve memory efficiency.

Suggested change
private final BitSet nullBits;
public TypedColumnAccessor(List<T> values, byte[] nulls) {
this.values = values;
this.nullBits = nulls != null ? BitSet.valueOf(nulls) : null;
}
@Override
public Object getValue(int rowIndex) {
if (nullBits != null && nullBits.get(rowIndex)) {
return null;
private final byte[] nulls;
private BitSet nullBits;
public TypedColumnAccessor(List<T> values, byte[] nulls) {
this.values = values;
this.nulls = nulls;
this.nullBits = null; // Lazy initialization
}
@Override
public Object getValue(int rowIndex) {
if (nulls != null) {
if (nullBits == null) {
nullBits = BitSet.valueOf(nulls);
}
if (nullBits.get(rowIndex)) {
return null;
}

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice suggestion. Thanks. Will implement in subsequent PR.

}
return values.get(rowIndex);
}
}
}
41 changes: 18 additions & 23 deletions src/main/java/com/databricks/jdbc/api/impl/LazyThriftResult.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package com.databricks.jdbc.api.impl;

import static com.databricks.jdbc.common.EnvironmentVariables.DEFAULT_RESULT_ROW_LIMIT;
import static com.databricks.jdbc.common.util.DatabricksThriftUtil.extractRowsFromColumnar;
import static com.databricks.jdbc.common.util.DatabricksThriftUtil.createColumnarView;

import com.databricks.jdbc.api.internal.IDatabricksSession;
import com.databricks.jdbc.api.internal.IDatabricksStatementInternal;
Expand All @@ -10,13 +10,12 @@
import com.databricks.jdbc.log.JdbcLoggerFactory;
import com.databricks.jdbc.model.client.thrift.generated.TFetchResultsResp;
import com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode;
import java.util.List;

public class LazyThriftResult implements IExecutionResult {
private static final JdbcLogger LOGGER = JdbcLoggerFactory.getLogger(LazyThriftResult.class);

private TFetchResultsResp currentResponse;
private List<List<Object>> currentBatch;
private ColumnarRowView currentBatch;
private int currentBatchIndex;
private long globalRowIndex;
private final IDatabricksSession session;
Expand Down Expand Up @@ -53,7 +52,7 @@ public LazyThriftResult(
loadCurrentBatch();
LOGGER.debug(
"LazyThriftResult initialized with {} rows in first batch, hasMoreRows: {}",
currentBatch.size(),
currentBatch.getRowCount(),
currentResponse.hasMoreRows);
}

Expand All @@ -75,16 +74,15 @@ public Object getObject(int columnIndex) throws DatabricksSQLException {
throw new DatabricksSQLException(
"Cursor is before first row", DatabricksDriverErrorCode.INVALID_STATE);
}
if (currentBatchIndex < 0 || currentBatchIndex >= currentBatch.size()) {
if (currentBatchIndex < 0 || currentBatchIndex >= currentBatch.getRowCount()) {
throw new DatabricksSQLException(
"Invalid cursor position", DatabricksDriverErrorCode.INVALID_STATE);
}
List<Object> currentRowData = currentBatch.get(currentBatchIndex);
if (columnIndex < 0 || columnIndex >= currentRowData.size()) {
if (columnIndex < 0 || columnIndex >= currentBatch.getColumnCount()) {
throw new DatabricksSQLException(
"Column index out of bounds " + columnIndex, DatabricksDriverErrorCode.INVALID_STATE);
}
return currentRowData.get(columnIndex);
return currentBatch.getValue(currentBatchIndex, columnIndex);
}

/**
Expand Down Expand Up @@ -128,13 +126,13 @@ public boolean next() throws DatabricksSQLException {
globalRowIndex++;

// Check if we need to fetch the next batch
if (currentBatchIndex >= currentBatch.size()) {
if (currentBatchIndex >= currentBatch.getRowCount()) {
// Keep fetching until we get a non-empty batch or no more rows
while (currentResponse.hasMoreRows) {
fetchNextBatch();

// If we got a non-empty batch, we can proceed
if (!currentBatch.isEmpty()) {
if (currentBatch.getRowCount() > 0) {
currentBatchIndex = 0; // Reset to first row of new batch
break;
}
Expand All @@ -143,7 +141,7 @@ public boolean next() throws DatabricksSQLException {
}

// If we exited the loop and still have an empty batch, we've reached the end
if (currentBatch.isEmpty()) {
if (currentBatch.getRowCount() == 0) {
hasReachedEnd = true;
globalRowIndex--; // Revert the increment since we didn't actually move to a new row
return false;
Expand Down Expand Up @@ -171,7 +169,7 @@ public boolean hasNext() {
}

// Check if there are more rows in current batch
if (currentBatchIndex + 1 < currentBatch.size()) {
if (currentBatchIndex + 1 < currentBatch.getRowCount()) {
return true;
}

Expand All @@ -196,7 +194,7 @@ public void close() {
@Override
public long getRowCount() {
// Return the number of rows in the current batch
return currentBatch != null ? currentBatch.size() : 0;
return currentBatch != null ? currentBatch.getRowCount() : 0;
}

/**
Expand All @@ -210,21 +208,18 @@ public long getChunkCount() {
return 0;
}

/**
* Loads the current response data into memory as a batch of rows.
*
* @throws DatabricksSQLException if the response data cannot be processed
*/
private void loadCurrentBatch() throws DatabricksSQLException {
currentBatch = extractRowsFromColumnar(currentResponse.getResults());
currentBatch = createColumnarView(currentResponse.getResults());
currentBatchIndex = -1; // Reset batch index
totalRowsFetched += currentBatch.size();
totalRowsFetched += currentBatch.getRowCount();
LOGGER.debug(
"Loaded batch with {} rows, total fetched: {}", currentBatch.size(), totalRowsFetched);
"Loaded batch with {} rows, total fetched: {}",
currentBatch.getRowCount(),
totalRowsFetched);
}

/**
* Fetches the next batch of data from the server and loads it into memory.
* Fetches the next batch of data from the server and creates columnar views.
*
* @throws DatabricksSQLException if the fetch operation fails
*/
Expand All @@ -236,7 +231,7 @@ private void fetchNextBatch() throws DatabricksSQLException {

LOGGER.debug(
"Fetched batch with {} rows, hasMoreRows: {}",
currentBatch.size(),
currentBatch.getRowCount(),
currentResponse.hasMoreRows);
} catch (DatabricksSQLException e) {
LOGGER.error("Failed to fetch next batch: {}", e.getMessage());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import static com.databricks.jdbc.common.util.DatabricksTypeUtil.*;
import static com.databricks.jdbc.model.client.thrift.generated.TTypeId.*;

import com.databricks.jdbc.api.impl.ColumnarRowView;
import com.databricks.jdbc.api.internal.IDatabricksSession;
import com.databricks.jdbc.api.internal.IDatabricksStatementInternal;
import com.databricks.jdbc.common.DatabricksJdbcConstants;
Expand Down Expand Up @@ -125,6 +126,17 @@ public static List<List<Object>> extractRowsFromColumnar(TRowSet rowSet)
return rows;
}

/**
* Memory-efficient alternative that creates a columnar view instead of materializing all rows.
* Use this method to significantly reduce memory allocations when accessing row data.
*
* @param rowSet that contains columnar data
* @return a columnar view that provides row-based access without full materialization
*/
public static ColumnarRowView createColumnarView(TRowSet rowSet) throws DatabricksSQLException {
return new ColumnarRowView(rowSet);
}

/** Returns statement status for given operation status response */
public static StatementStatus getStatementStatus(TGetOperationStatusResp resp) {
StatementState state = null;
Expand Down
Loading
Loading