databricks-jdbc/src/main/java/com/databricks/jdbc/api/impl/arrow/StreamingInlineArrowResult.java at e23ef7ef09152de094c0106d831e60181f93e83b · gopalldb/databricks-jdbc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
package com.databricks.jdbc.api.impl.arrow;

import static com.databricks.jdbc.common.EnvironmentVariables.DEFAULT_STREAMING_BATCH_TIMEOUT_SECONDS;
import static com.databricks.jdbc.common.util.ArrowUtil.getColumnInfoList;

import com.databricks.jdbc.api.impl.IExecutionResult;
import com.databricks.jdbc.api.impl.streaming.StreamingBatch;
import com.databricks.jdbc.api.impl.streaming.ThriftStreamingProvider;
import com.databricks.jdbc.api.impl.thrift.ThriftBatchFetcher;
import com.databricks.jdbc.api.impl.thrift.ThriftBatchFetcherImpl;
import com.databricks.jdbc.api.internal.IDatabricksSession;
import com.databricks.jdbc.api.internal.IDatabricksStatementInternal;
import com.databricks.jdbc.exception.DatabricksSQLException;
import com.databricks.jdbc.log.JdbcLogger;
import com.databricks.jdbc.log.JdbcLoggerFactory;
import com.databricks.jdbc.model.client.thrift.generated.TFetchResultsResp;
import com.databricks.jdbc.model.core.ColumnInfo;
import com.databricks.jdbc.model.core.ColumnInfoTypeName;
import com.databricks.jdbc.model.telemetry.enums.DatabricksDriverErrorCode;
import java.util.List;

/**
 * High-throughput streaming implementation for inline Arrow results.
 *
 * <p>Uses {@link ThriftStreamingProvider} for proactive batch prefetching, achieving throughput
 * comparable to eager loading while maintaining the memory benefits of lazy loading.
 *
 * <p>Key features:
 *
 * <ul>
 *   <li>Background prefetch thread fetches Arrow batches ahead of consumption
 *   <li>Sliding window limits memory usage to a configurable number of batches
 *   <li>Non-blocking iteration when prefetch keeps up with consumption
 *   <li>Automatic native memory cleanup via type-safe release actions
 *   <li>Type-safe: Uses generic {@code ThriftStreamingProvider<ArrowResultChunk>}
 * </ul>
 *
 * <p>This implementation replaces {@code LazyThriftInlineArrowResult} for improved throughput.
 */
public class StreamingInlineArrowResult implements IExecutionResult {

  private static final JdbcLogger LOGGER =
      JdbcLoggerFactory.getLogger(StreamingInlineArrowResult.class);

  // Streaming infrastructure - type-safe generic provider
  private final ThriftStreamingProvider<ArrowResultChunk> provider;
  private final IDatabricksSession session;

  // Current position
  private StreamingBatch<ArrowResultChunk> currentBatch;
  private ArrowResultChunkIterator currentChunkIterator;
  private long globalRowIndex;

  // Metadata
  private List<ColumnInfo> columnInfos;

  // State
  private boolean hasReachedEnd;
  private volatile boolean isClosed;

  /**
   * Creates a new StreamingInlineArrowResult.
   *
   * <p>Configuration values (maxBatchesInMemory, timeout) are read from the session's connection
   * context.
   *
   * @param initialResponse The initial Thrift response containing the first Arrow batch
   * @param statement The statement that generated this result
   * @param session The session for fetching additional batches
   * @throws DatabricksSQLException if initialization fails
   */
  public StreamingInlineArrowResult(
      TFetchResultsResp initialResponse,
      IDatabricksStatementInternal statement,
      IDatabricksSession session)
      throws DatabricksSQLException {

    this.session = session;
    this.globalRowIndex = -1;
    this.hasReachedEnd = false;
    this.isClosed = false;

    // Initialize column info from metadata
    this.columnInfos = getColumnInfoList(initialResponse.getResultSetMetadata());

    // Create batch fetcher and type-safe generic provider for Arrow
    ThriftBatchFetcher fetcher = new ThriftBatchFetcherImpl(session, statement);
    this.provider =
        ThriftStreamingProvider.forInlineArrow(
            fetcher,
            initialResponse,
            statement != null ? statement.getStatementId() : null,
            session.getConnectionContext().getThriftMaxBatchesInMemory(),
            DEFAULT_STREAMING_BATCH_TIMEOUT_SECONDS);

    // Move to first batch (check nextBatch() return value to handle empty initial batches)
    if (provider.nextBatch()) {
      currentBatch = provider.getCurrentBatch();
      // Type-safe: getData() returns ArrowResultChunk directly!
      currentChunkIterator = currentBatch.getData().getChunkIterator();
    }

    LOGGER.debug(
        "StreamingInlineArrowResult initialized - firstBatchRows={}, maxBatchesInMemory={}",
        currentBatch != null ? currentBatch.getRowCount() : 0,
        session.getConnectionContext().getThriftMaxBatchesInMemory());
  }

  /**
   * Gets the value at the specified column index for the current row.
   *
   * @param columnIndex the zero-based column index
   * @return the value at the specified column
   * @throws DatabricksSQLException if access fails
   */
  @Override
  public Object getObject(int columnIndex) throws DatabricksSQLException {
    validateGetObjectState(columnIndex);

    ColumnInfo columnInfo = columnInfos.get(columnIndex);
    ColumnInfoTypeName requiredType = columnInfo.getTypeName();
    String arrowMetadata = currentChunkIterator.getType(columnIndex);
    if (arrowMetadata == null) {
      arrowMetadata = columnInfo.getTypeText();
    }

    // Use shared complex type handling from ArrowStreamResult
    return ArrowStreamResult.getObjectWithComplexTypeHandling(
        session, currentChunkIterator, columnIndex, requiredType, arrowMetadata, columnInfo);
  }

  /** Validates state before getting an object. */
  private void validateGetObjectState(int columnIndex) throws DatabricksSQLException {
    if (isClosed) {
      LOGGER.error("Attempted to access closed result");
      throw new DatabricksSQLException(
          "Result is closed", DatabricksDriverErrorCode.STATEMENT_CLOSED);
    }
    if (globalRowIndex == -1) {
      LOGGER.error("Attempted to access data before first row");
      throw new DatabricksSQLException(
          "Cursor is before first row", DatabricksDriverErrorCode.INVALID_STATE);
    }
    if (currentChunkIterator == null) {
      LOGGER.error("No current chunk available at row {}", globalRowIndex);
      throw new DatabricksSQLException(
          "No current chunk available", DatabricksDriverErrorCode.INVALID_STATE);
    }
    if (columnIndex < 0 || columnIndex >= columnInfos.size()) {
      LOGGER.error("Column index {} out of bounds (0-{})", columnIndex, columnInfos.size() - 1);
      throw new DatabricksSQLException(
          "Column index out of bounds: " + columnIndex, DatabricksDriverErrorCode.INVALID_STATE);
    }
  }

  /**
   * Gets the current row index (0-based). Returns -1 if before the first row.
   *
   * @return the current row index
   */
  @Override
  public long getCurrentRow() {
    return globalRowIndex;
  }

  /**
   * Moves the cursor to the next row. Fetches additional batches from server if needed.
   *
   * @return true if there is a next row, false if at the end
   * @throws DatabricksSQLException if an error occurs
   */
  @Override
  public boolean next() throws DatabricksSQLException {
    if (isClosed || hasReachedEnd) {
      return false;
    }

    if (!hasNext()) {
      return false;
    }

    globalRowIndex++;

    // Try to move to next row in current chunk
    if (currentChunkIterator != null && currentChunkIterator.hasNextRow()) {
      currentChunkIterator.nextRow();
      return true;
    }

    // Need to move to next batch
    if (provider.hasNextBatch()) {
      provider.nextBatch();
      currentBatch = provider.getCurrentBatch();

      if (currentBatch != null) {
        // Type-safe: getData() returns ArrowResultChunk directly!
        ArrowResultChunk chunk = currentBatch.getData();
        if (chunk == null) {
          LOGGER.warn("Batch {} has null data", currentBatch.getBatchIndex());
          hasReachedEnd = true;
          globalRowIndex--;
          return false;
        }
        currentChunkIterator = chunk.getChunkIterator();
        currentChunkIterator.nextRow();

        LOGGER.debug(
            "Moved to batch {} - globalRow={}, batchesInMemory={}",
            currentBatch.getBatchIndex(),
            globalRowIndex,
            provider.getBatchesInMemory());

        return true;
      }
    }

    // No more data
    hasReachedEnd = true;
    globalRowIndex--;
    return false;
  }

  /**
   * Checks if there are more rows available without advancing the cursor.
   *
   * @return true if there are more rows, false otherwise
   */
  @Override
  public boolean hasNext() {
    if (isClosed || hasReachedEnd) {
      return false;
    }

    // Check current chunk
    if (currentChunkIterator != null && currentChunkIterator.hasNextRow()) {
      return true;
    }

    // Check if more batches available
    return provider.hasNextBatch();
  }

  /** Closes this result and releases associated resources. */
  @Override
  public void close() {
    if (isClosed) {
      return;
    }

    long totalRows = provider.getTotalRowsFetched();
    isClosed = true;
    currentBatch = null;
    currentChunkIterator = null;

    // Provider will release all Arrow chunks using the type-safe Consumer<ArrowResultChunk>
    provider.close();

    LOGGER.debug("Closed - totalRowsFetched={}, rowsConsumed={}", totalRows, globalRowIndex + 1);
  }

  /**
   * Gets the number of rows in the current batch.
   *
   * @return the number of rows in the current batch
   */
  @Override
  public long getRowCount() {
    return currentBatch != null ? currentBatch.getRowCount() : 0;
  }

  /**
   * Gets the chunk count. Always returns 0 for streaming results.
   *
   * @return 0
   */
  @Override
  public long getChunkCount() {
    return 0;
  }

  /**
   * Gets the Arrow metadata for the current chunk.
   *
   * @return list of arrow metadata strings, or null if no chunk is loaded
   * @throws DatabricksSQLException if an error occurs
   */
  public List<String> getArrowMetadata() throws DatabricksSQLException {
    if (currentBatch == null) {
      return null;
    }
    ArrowResultChunk chunk = currentBatch.getData();
    return chunk != null ? chunk.getArrowMetadata() : null;
  }

  /**
   * Gets the total number of rows fetched from the server so far.
   *
   * @return the total rows fetched
   */
  public long getTotalRowsFetched() {
    return provider.getTotalRowsFetched();
  }

  /**
   * Checks if all data has been fetched from the server.
   *
   * @return true if end of stream reached
   */
  public boolean isCompletelyFetched() {
    return hasReachedEnd || provider.isEndOfStreamReached();
  }

  /**
   * Gets the number of batches currently in memory.
   *
   * @return the batch count in memory
   */
  public int getBatchesInMemory() {
    return provider.getBatchesInMemory();
  }
}