|
| 1 | +package arrowbased |
| 2 | + |
| 3 | +import ( |
| 4 | + "bytes" |
| 5 | + "context" |
| 6 | + "io" |
| 7 | + |
| 8 | + "github.com/databricks/databricks-sql-go/internal/cli_service" |
| 9 | + "github.com/databricks/databricks-sql-go/internal/config" |
| 10 | + "github.com/databricks/databricks-sql-go/internal/rows/rowscanner" |
| 11 | + dbsqlrows "github.com/databricks/databricks-sql-go/rows" |
| 12 | + "github.com/pierrec/lz4/v4" |
| 13 | +) |
| 14 | + |
| 15 | +// ipcStreamIterator provides access to raw Arrow IPC streams without deserialization |
| 16 | +type ipcStreamIterator struct { |
| 17 | + ctx context.Context |
| 18 | + resultPageIterator rowscanner.ResultPageIterator |
| 19 | + currentBatches []*cli_service.TSparkArrowBatch |
| 20 | + currentIndex int |
| 21 | + arrowSchemaBytes []byte |
| 22 | + useLz4 bool |
| 23 | + hasMorePages bool |
| 24 | +} |
| 25 | + |
| 26 | +// NewIPCStreamIterator creates an iterator that returns raw IPC streams |
| 27 | +func NewIPCStreamIterator( |
| 28 | + ctx context.Context, |
| 29 | + resultPageIterator rowscanner.ResultPageIterator, |
| 30 | + initialRowSet *cli_service.TRowSet, |
| 31 | + schemaBytes []byte, |
| 32 | + cfg *config.Config, |
| 33 | +) (dbsqlrows.IPCStreamIterator, error) { |
| 34 | + var useLz4 bool |
| 35 | + if cfg != nil { |
| 36 | + useLz4 = cfg.UseLz4Compression |
| 37 | + } |
| 38 | + |
| 39 | + var batches []*cli_service.TSparkArrowBatch |
| 40 | + if initialRowSet != nil { |
| 41 | + batches = initialRowSet.ArrowBatches |
| 42 | + } |
| 43 | + |
| 44 | + return &ipcStreamIterator{ |
| 45 | + ctx: ctx, |
| 46 | + resultPageIterator: resultPageIterator, |
| 47 | + currentBatches: batches, |
| 48 | + currentIndex: 0, |
| 49 | + arrowSchemaBytes: schemaBytes, |
| 50 | + useLz4: useLz4, |
| 51 | + hasMorePages: resultPageIterator != nil && resultPageIterator.HasNext(), |
| 52 | + }, nil |
| 53 | +} |
| 54 | + |
| 55 | +// NextIPCStream returns the next Arrow batch as a raw IPC stream |
| 56 | +func (it *ipcStreamIterator) NextIPCStream() (io.Reader, error) { |
| 57 | + // Check if we need to load more batches from the next page |
| 58 | + if it.currentIndex >= len(it.currentBatches) { |
| 59 | + if !it.hasMorePages || it.resultPageIterator == nil { |
| 60 | + return nil, io.EOF |
| 61 | + } |
| 62 | + |
| 63 | + // Fetch next page |
| 64 | + fetchResult, err := it.resultPageIterator.Next() |
| 65 | + if err != nil { |
| 66 | + return nil, err |
| 67 | + } |
| 68 | + |
| 69 | + if fetchResult == nil || fetchResult.Results == nil || fetchResult.Results.ArrowBatches == nil { |
| 70 | + return nil, io.EOF |
| 71 | + } |
| 72 | + |
| 73 | + it.currentBatches = fetchResult.Results.ArrowBatches |
| 74 | + it.currentIndex = 0 |
| 75 | + it.hasMorePages = it.resultPageIterator.HasNext() |
| 76 | + |
| 77 | + // If no batches in this page, recurse to try next page |
| 78 | + if len(it.currentBatches) == 0 { |
| 79 | + return it.NextIPCStream() |
| 80 | + } |
| 81 | + } |
| 82 | + |
| 83 | + batch := it.currentBatches[it.currentIndex] |
| 84 | + it.currentIndex++ |
| 85 | + |
| 86 | + // Create reader for the batch data |
| 87 | + var batchReader io.Reader = bytes.NewReader(batch.Batch) |
| 88 | + |
| 89 | + // Handle LZ4 decompression if needed |
| 90 | + if it.useLz4 { |
| 91 | + batchReader = lz4.NewReader(batchReader) |
| 92 | + } |
| 93 | + |
| 94 | + // Combine schema and batch data into a complete IPC stream |
| 95 | + // Arrow IPC format expects: [Schema][Batch1][Batch2]... |
| 96 | + return io.MultiReader( |
| 97 | + bytes.NewReader(it.arrowSchemaBytes), |
| 98 | + batchReader, |
| 99 | + ), nil |
| 100 | +} |
| 101 | + |
| 102 | +// HasNext returns true if there are more batches |
| 103 | +func (it *ipcStreamIterator) HasNext() bool { |
| 104 | + return it.currentIndex < len(it.currentBatches) || it.hasMorePages |
| 105 | +} |
| 106 | + |
| 107 | +// Close releases any resources |
| 108 | +func (it *ipcStreamIterator) Close() { |
| 109 | + // Nothing to close for this implementation |
| 110 | +} |
| 111 | + |
| 112 | +// GetSchemaBytes returns the Arrow schema in IPC format |
| 113 | +func (it *ipcStreamIterator) GetSchemaBytes() ([]byte, error) { |
| 114 | + return it.arrowSchemaBytes, nil |
| 115 | +} |
0 commit comments