Skip to content

Commit 3c0f7e4

Browse files
authored
[ES-1804970] Fix CloudFetch returning stale column names from cached results (#351)
## Summary Fixes a bug where `arrow.Record.Schema()` returns stale column aliases when CloudFetch serves cached Arrow IPC files from a structurally identical prior query with different `AS` aliases. - **Root cause:** `NewCloudBatchIterator` was not receiving the authoritative schema bytes from `GetResultSetMetadata`, unlike the local batch path which already had this. CloudFetch Arrow IPC files have column names baked in from the original query, and the driver was reading them as-is. - **Fix:** Pass `arrowSchemaBytes` (the authoritative schema from `GetResultSetMetadata`) into `NewCloudBatchIterator`. After records are deserialized from the IPC stream, replace the stale schema with the authoritative one using `array.NewRecord()` (zero-copy — shares underlying column data, only swaps metadata). ## Changes - **`arrowRecordIterator.go`** — Pass `ri.arrowSchemaBytes` to `NewCloudBatchIterator` in `newBatchIterator()` - **`arrowRows.go`** — Pass `schemaBytes` to `NewCloudBatchIterator` in `NewArrowRowScanner()` - **`batchloader.go`** — Core fix: - `NewCloudBatchIterator` accepts `arrowSchemaBytes`, parses into `*arrow.Schema`, stores on `batchIterator` - `batchIterator.Next()` applies override schema to CloudFetch records only (local path is untouched, `overrideSchema` is `nil`) - Added `schemaFromIPCBytes()` helper - Field count validation guard to prevent panics on schema mismatch - Schema parse failure logged at `Warn` level - **`batchloader_test.go`** — Added `TestCloudFetchSchemaOverride` with two subtests: - Verifies stale column names `["id","name"]` are overridden to `["x","y"]` - Verifies `nil` schema bytes pass through original names unchanged ## Who is affected Go driver users with CloudFetch enabled (`WithCloudFetch(true)`) who read `arrow.Record.Schema()` directly. Python, ODBC, and JDBC drivers are not affected. 
## Test plan - [x] All existing unit tests pass (37 tests in `internal/rows/arrowbased/`) - [x] New unit test `TestCloudFetchSchemaOverride` covers the override and no-override paths - [x] Verified end-to-end against a real Databricks warehouse using `samples.tpch.lineitem` (~30M rows) with two queries differing only in column aliases — confirmed `arrow.Record.Schema()` now returns correct aliases This pull request was AI-assisted by Isaac. --------- Signed-off-by: Sreekanth Vadigi <sreekanth.vadigi@databricks.com>
1 parent 305e3bc commit 3c0f7e4

4 files changed

Lines changed: 172 additions & 6 deletions

File tree

internal/rows/arrowbased/arrowRecordIterator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ func (ri *arrowRecordIterator) getBatchIterator() error {
169169
func (ri *arrowRecordIterator) newBatchIterator(fr *cli_service.TFetchResultsResp) (BatchIterator, error) {
170170
rowSet := fr.Results
171171
if len(rowSet.ResultLinks) > 0 {
172-
return NewCloudBatchIterator(ri.ctx, rowSet.ResultLinks, rowSet.StartRowOffset, &ri.cfg, nil)
172+
return NewCloudBatchIterator(ri.ctx, rowSet.ResultLinks, rowSet.StartRowOffset, ri.arrowSchemaBytes, &ri.cfg, nil)
173173
} else {
174174
return NewLocalBatchIterator(ri.ctx, rowSet.ArrowBatches, rowSet.StartRowOffset, ri.arrowSchemaBytes, &ri.cfg)
175175
}

internal/rows/arrowbased/arrowRows.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ func NewArrowRowScanner(resultSetMetadata *cli_service.TGetResultSetMetadataResp
121121
for _, resultLink := range rowSet.ResultLinks {
122122
logger.Debug().Msgf("- start row offset: %d, row count: %d", resultLink.StartRowOffset, resultLink.RowCount)
123123
}
124-
bi, err2 = NewCloudBatchIterator(context.Background(), rowSet.ResultLinks, rowSet.StartRowOffset, cfg, onCloudFetchDownload)
124+
bi, err2 = NewCloudBatchIterator(context.Background(), rowSet.ResultLinks, rowSet.StartRowOffset, schemaBytes, cfg, onCloudFetchDownload)
125125
} else {
126126
bi, err2 = NewLocalBatchIterator(context.Background(), rowSet.ArrowBatches, rowSet.StartRowOffset, schemaBytes, cfg)
127127
}

internal/rows/arrowbased/batchloader.go

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515

1616
"net/http"
1717

18+
"github.com/apache/arrow/go/v12/arrow"
19+
"github.com/apache/arrow/go/v12/arrow/array"
1820
"github.com/apache/arrow/go/v12/arrow/ipc"
1921
dbsqlerr "github.com/databricks/databricks-sql-go/errors"
2022
"github.com/databricks/databricks-sql-go/internal/cli_service"
@@ -57,19 +59,36 @@ func NewCloudIPCStreamIterator(
5759
return bi, nil
5860
}
5961

// NewCloudBatchIterator creates a cloud-based BatchIterator for backward compatibility.
// arrowSchemaBytes is the authoritative schema from GetResultSetMetadata, used to
// override stale column names in cached Arrow IPC files.
//
// files lists the CloudFetch result links to download; startRowOffset is the row
// offset associated with the first file. onFileDownloaded, when non-nil, is invoked
// with the download duration (ms) of each fetched file.
func NewCloudBatchIterator(
	ctx context.Context,
	files []*cli_service.TSparkArrowResultLink,
	startRowOffset int64,
	arrowSchemaBytes []byte,
	cfg *config.Config,
	onFileDownloaded func(downloadMs int64),
) (BatchIterator, dbsqlerr.DBError) {
	ipcIterator, err := NewCloudIPCStreamIterator(ctx, files, startRowOffset, cfg, onFileDownloaded)
	if err != nil {
		return nil, err
	}

	// Parse the authoritative schema once, up front. On parse failure we only log
	// a warning and leave overrideSchema nil, so iteration falls back to the
	// schema embedded in the downloaded IPC files instead of failing the read.
	var overrideSchema *arrow.Schema
	if len(arrowSchemaBytes) > 0 {
		var schemaErr error
		overrideSchema, schemaErr = schemaFromIPCBytes(arrowSchemaBytes)
		if schemaErr != nil {
			logger.Warn().Msgf("CloudFetch: failed to parse override schema: %v", schemaErr)
		}
	}

	return &batchIterator{
		ipcIterator:    ipcIterator,
		startRowOffset: startRowOffset,
		overrideSchema: overrideSchema,
	}, nil
}
7493

7594
func NewLocalIPCStreamIterator(
@@ -416,6 +435,7 @@ type BatchIterator interface {
// batchIterator adapts an IPCStreamIterator into a BatchIterator, carrying the
// starting row offset and an optional schema override for CloudFetch results.
type batchIterator struct {
	ipcIterator    IPCStreamIterator
	startRowOffset int64
	overrideSchema *arrow.Schema // authoritative schema to fix stale CloudFetch column names
}
420440

421441
// NewBatchIterator creates a BatchIterator from an IPCStreamIterator
@@ -437,6 +457,24 @@ func (bi *batchIterator) Next() (SparkArrowBatch, error) {
437457
return nil, err
438458
}
439459

460+
// When using CloudFetch, cached Arrow IPC files may contain stale column
461+
// names from a previous query. Replace the embedded schema with the
462+
// authoritative schema from GetResultSetMetadata.
463+
if bi.overrideSchema != nil && len(records) > 0 && len(bi.overrideSchema.Fields()) == len(records[0].Columns()) {
464+
for i, rec := range records {
465+
sar, ok := rec.(*sparkArrowRecord)
466+
if !ok {
467+
continue
468+
}
469+
corrected := array.NewRecord(bi.overrideSchema, sar.Columns(), sar.NumRows())
470+
sar.Release()
471+
records[i] = &sparkArrowRecord{
472+
Delimiter: sar.Delimiter,
473+
Record: corrected,
474+
}
475+
}
476+
}
477+
440478
// Calculate total rows in this batch
441479
totalRows := int64(0)
442480
for _, record := range records {
@@ -459,3 +497,13 @@ func (bi *batchIterator) HasNext() bool {
// Close releases the resources held by the underlying IPC stream iterator.
func (bi *batchIterator) Close() {
	bi.ipcIterator.Close()
}
// schemaFromIPCBytes parses Arrow schema bytes (IPC stream format) into an
// *arrow.Schema. The input is expected to be an IPC stream whose only message
// is the schema (e.g. produced by closing an ipc.Writer with no records
// written, as the accompanying test does).
func schemaFromIPCBytes(schemaBytes []byte) (*arrow.Schema, error) {
	reader, err := ipc.NewReader(bytes.NewReader(schemaBytes))
	if err != nil {
		return nil, err
	}
	defer reader.Release()
	// NOTE(review): the returned schema is used after the reader is released;
	// this assumes arrow.Schema is a plain value not tied to the reader's
	// lifetime — confirm against the arrow/go ipc documentation.
	return reader.Schema(), nil
}

internal/rows/arrowbased/batchloader_test.go

Lines changed: 120 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ func TestCloudFetchIterator(t *testing.T) {
7777
context.Background(),
7878
links,
7979
startRowOffset,
80+
nil,
8081
cfg,
8182
nil,
8283
)
@@ -152,6 +153,7 @@ func TestCloudFetchIterator(t *testing.T) {
152153
context.Background(),
153154
links,
154155
startRowOffset,
156+
nil,
155157
cfg,
156158
nil,
157159
)
@@ -211,6 +213,7 @@ func TestCloudFetchIterator(t *testing.T) {
211213
context.Background(),
212214
links,
213215
startRowOffset,
216+
nil,
214217
cfg,
215218
nil,
216219
)
@@ -286,6 +289,7 @@ func TestCloudFetchIterator(t *testing.T) {
286289
RowCount: 1,
287290
}},
288291
startRowOffset,
292+
nil,
289293
cfg,
290294
nil,
291295
)
@@ -325,6 +329,7 @@ func TestCloudFetchIterator(t *testing.T) {
325329
RowCount: 1,
326330
}},
327331
startRowOffset,
332+
nil,
328333
cfg,
329334
nil,
330335
)
@@ -340,6 +345,119 @@ func TestCloudFetchIterator(t *testing.T) {
340345
})
341346
}
342347

// TestCloudFetchSchemaOverride reproduces ES-1804970: when the server result
// cache serves Arrow IPC files from a prior query, the embedded schema has
// stale column names. The authoritative schema from GetResultSetMetadata must
// override them.
func TestCloudFetchSchemaOverride(t *testing.T) {
	// IPC data has columns ["id", "name"] (stale, from cached query).
	staleRecord := generateArrowRecord()
	staleIPCBytes := generateMockArrowBytes(staleRecord)

	// Authoritative schema has columns ["x", "y"] (correct, from GetResultSetMetadata).
	correctFields := []arrow.Field{
		{Name: "x", Type: arrow.PrimitiveTypes.Int32},
		{Name: "y", Type: arrow.BinaryTypes.String},
	}
	correctSchema := arrow.NewSchema(correctFields, nil)
	var schemaBuf bytes.Buffer
	// Closing the writer with no records written emits an IPC stream that
	// contains only the schema message.
	schemaWriter := ipc.NewWriter(&schemaBuf, ipc.WithSchema(correctSchema))
	if err := schemaWriter.Close(); err != nil {
		t.Fatal(err)
	}
	correctSchemaBytes := schemaBuf.Bytes()

	// Serve the stale IPC data via a mock HTTP server standing in for the
	// CloudFetch result links.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, err := w.Write(staleIPCBytes)
		if err != nil {
			panic(err)
		}
	}))
	defer server.Close()

	t.Run("should override stale column names with authoritative schema", func(t *testing.T) {
		links := []*cli_service.TSparkArrowResultLink{
			{
				FileLink:       server.URL,
				ExpiryTime:     time.Now().Add(10 * time.Minute).Unix(),
				StartRowOffset: 0,
				RowCount:       3,
			},
		}

		cfg := config.WithDefaults()
		cfg.UseLz4Compression = false
		cfg.MaxDownloadThreads = 1

		bi, err := NewCloudBatchIterator(
			context.Background(),
			links,
			0,
			correctSchemaBytes,
			cfg,
			nil,
		)
		assert.Nil(t, err)

		batch, batchErr := bi.Next()
		assert.Nil(t, batchErr)
		assert.NotNil(t, batch)

		rec, recErr := batch.Next()
		assert.Nil(t, recErr)
		assert.NotNil(t, rec)

		// The record schema must use the authoritative names, not the stale ones.
		assert.Equal(t, "x", rec.Schema().Field(0).Name)
		assert.Equal(t, "y", rec.Schema().Field(1).Name)

		// Data must be preserved: same row count and field count as the stale record.
		assert.Equal(t, int64(3), rec.NumRows())
		assert.Equal(t, 2, len(rec.Schema().Fields()))

		rec.Release()
	})

	t.Run("should pass through unchanged when no override schema provided", func(t *testing.T) {
		links := []*cli_service.TSparkArrowResultLink{
			{
				FileLink:       server.URL,
				ExpiryTime:     time.Now().Add(10 * time.Minute).Unix(),
				StartRowOffset: 0,
				RowCount:       3,
			},
		}

		cfg := config.WithDefaults()
		cfg.UseLz4Compression = false
		cfg.MaxDownloadThreads = 1

		bi, err := NewCloudBatchIterator(
			context.Background(),
			links,
			0,
			nil,
			cfg,
			nil,
		)
		assert.Nil(t, err)

		batch, batchErr := bi.Next()
		assert.Nil(t, batchErr)

		rec, recErr := batch.Next()
		assert.Nil(t, recErr)

		// Without an override, the original (stale) column names are preserved.
		assert.Equal(t, "id", rec.Schema().Field(0).Name)
		assert.Equal(t, "name", rec.Schema().Field(1).Name)

		rec.Release()
	})
}
460+
343461
// TestCloudFetchIterator_OnFileDownloaded_CallbackInvokedWithPositiveDuration verifies
344462
// that the onFileDownloaded telemetry callback is called once per downloaded S3 file with
345463
// a positive downloadMs value.
@@ -388,7 +506,7 @@ func TestCloudFetchIterator_OnFileDownloaded_CallbackInvokedWithPositiveDuration
388506
callbackMu.Unlock()
389507
}
390508

391-
bi, err := NewCloudBatchIterator(context.Background(), links, startRowOffset, cfg, onFileDownloaded)
509+
bi, err := NewCloudBatchIterator(context.Background(), links, startRowOffset, nil, cfg, onFileDownloaded)
392510
assert.Nil(t, err)
393511

394512
// Consume all batches to trigger the downloads.
@@ -439,7 +557,7 @@ func TestCloudFetchIterator_OnFileDownloaded_NilCallbackDoesNotPanic(t *testing.
439557
cfg.MaxDownloadThreads = 1
440558

441559
// nil callback — must not panic
442-
bi, err := NewCloudBatchIterator(context.Background(), links, startRowOffset, cfg, nil)
560+
bi, err := NewCloudBatchIterator(context.Background(), links, startRowOffset, nil, cfg, nil)
443561
assert.Nil(t, err)
444562

445563
assert.NotPanics(t, func() {

0 commit comments

Comments
 (0)