redpanda-data
diff --git a/‎docs/modules/components/pages/outputs/gcp_bigquery_write_api.adoc‎
Lines changed: 148 additions & 0 deletions b/‎docs/modules/components/pages/outputs/gcp_bigquery_write_api.adoc‎
Lines changed: 148 additions & 0 deletions
diff --git a/‎internal/impl/gcp/enterprise/bigquery/integration_test.go‎
Lines changed: 176 additions & 0 deletions b/‎internal/impl/gcp/enterprise/bigquery/integration_test.go‎
Lines changed: 176 additions & 0 deletions
@@ -66,6 +66,15 @@ output:
     dataset: "" # No default (required)
     table: "" # No default (required)
     message_format: json
+    write_mode: default_stream
+    auto_create_table: false
+    schema: []
+    time_partitioning:
+      type: "" # No default (optional)
+      field: ""
+      expiration: 0s
+      require_filter: false
+    clustering: []
     max_in_flight: 4
     batching:
       count: 0
@@ -104,6 +113,21 @@ All messages in the same batch are written to that table.
 The interpolated table name is sanitized for BigQuery: dots, hyphens, slashes and whitespace are replaced with underscores, non-ASCII-alphanumeric characters are stripped, leading digits are prefixed with `_`, and the result is truncated to 1024 characters.
 A name that sanitizes to the empty string is rejected as a permanent error.
 
+== Write modes
+
+The `write_mode` field selects between two write paths:
+
+- `default_stream` (default): the multiplexed default stream. Lowest latency, at-least-once semantics.
+- `pending_stream`: a fresh pending stream is allocated per batch; rows are written with sequential offsets, the stream is finalized, then atomically committed. Provides exactly-once semantics within a single committed batch.
+
+== Auto-create
+
+When `auto_create_table` is true, the output creates missing tables on the fly using the configured `schema`, `time_partitioning`, and `clustering`. `AlreadyExists` errors from concurrent creators are treated as success. When the table name is interpolated, every auto-created table receives the same configuration.
+
+== Exactly-once caveat
+
+The exactly-once guarantee of `pending_stream` holds only within a single stream. If a `BatchCommitWriteStreams` RPC succeeds but its response is lost to a network failure, the output retries the batch through a new pending stream and the data lands twice. This is a fundamental limitation of the BigQuery Storage Write API exactly-once contract and applies to every implementation.
+
 
 == Fields
 
@@ -147,6 +171,130 @@ Options:
 , `protobuf`
 .
 
+=== `write_mode`
+
+How the output writes to BigQuery. `default_stream` uses the multiplexed default stream (at-least-once, lowest latency). `pending_stream` allocates a per-batch pending stream that commits atomically, providing exactly-once semantics within a single committed batch.
+
+
+*Type*: `string`
+
+*Default*: `"default_stream"`
+
+Options:
+`default_stream`
+, `pending_stream`
+.
+
+=== `auto_create_table`
+
+If true and the target table does not exist, the output creates it using the configured `schema`, `time_partitioning`, and `clustering`. `AlreadyExists` errors from concurrent creators are treated as success. When the table name is interpolated, every auto-created table receives the same schema and partition/clustering settings.
+
+
+*Type*: `bool`
+
+*Default*: `false`
+
+=== `schema`
+
+Column definitions used by `auto_create_table`. Required when `auto_create_table` is true.
+
+
+*Type*: `array`
+
+*Default*: `[]`
+
+=== `schema[].name`
+
+Column name.
+
+
+*Type*: `string`
+
+
+=== `schema[].type`
+
+BigQuery column type (STRING, BYTES, INTEGER/INT64, FLOAT/FLOAT64, NUMERIC, BIGNUMERIC, BOOLEAN/BOOL, TIMESTAMP, DATE, TIME, DATETIME, GEOGRAPHY, JSON, RECORD).
+
+
+*Type*: `string`
+
+
+=== `schema[].mode`
+
+Column mode: NULLABLE (default), REQUIRED, or REPEATED.
+
+
+*Type*: `string`
+
+*Default*: `"NULLABLE"`
+
+=== `schema[].fields`
+
+For RECORD columns, the list of nested fields. Same shape as the top-level schema list.
+
+
+*Type*: `array`
+
+
+=== `time_partitioning`
+
+Optional time-partitioning settings applied during `auto_create_table`. Partitioning is applied only when `type` is set; if `type` is omitted, the entire block is ignored.
+
+
+*Type*: `object`
+
+
+=== `time_partitioning.type`
+
+Partitioning granularity.
+
+
+*Type*: `string`
+
+
+Options:
+`DAY`
+, `HOUR`
+, `MONTH`
+, `YEAR`
+.
+
+=== `time_partitioning.field`
+
+Column to partition on. Must be of type DATE, TIMESTAMP, or DATETIME. If empty, the table uses ingestion-time partitioning (`_PARTITIONTIME`).
+
+
+*Type*: `string`
+
+*Default*: `""`
+
+=== `time_partitioning.expiration`
+
+Optional partition expiration. Zero means no expiration.
+
+
+*Type*: `string`
+
+*Default*: `"0s"`
+
+=== `time_partitioning.require_filter`
+
+If true, queries against the table must filter on the partition column.
+
+
+*Type*: `bool`
+
+*Default*: `false`
+
+=== `clustering`
+
+Optional clustering columns (up to 4) applied during `auto_create_table`. All names must appear in `schema`.
+
+
+*Type*: `array`
+
+*Default*: `[]`
+
 === `max_in_flight`
 
 The maximum number of messages to have in flight at a given time. Increase this to improve throughput.
 
@@ -250,6 +250,182 @@ func TestIntegrationSchemaEvolution(t *testing.T) {
 	assert.True(t, evolved, "missing==0 must signal retry, not a permanent failure, so concurrent batches are not dropped to DLQ")
 }
 
+// TestIntegrationAutoCreateTable runs a stream against the BigQuery emulator
+// with auto_create_table enabled while the target table does not exist. It
+// sends a single message, then verifies that the table shows up with the
+// three configured columns and that the row becomes readable.
+func TestIntegrationAutoCreateTable(t *testing.T) {
+	integration.CheckSkip(t)
+
+	const (
+		projectID = "test-project"
+		datasetID = "test_dataset"
+		tableID   = "auto_created"
+	)
+
+	emu := startEmulator(t, projectID, datasetID)
+	// Table handle shared by every assertion below.
+	tbl := emu.bqClient.Dataset(datasetID).Table(tableID)
+
+	t.Log("Given the table does not exist and auto_create_table is enabled")
+
+	builder := service.NewStreamBuilder()
+	require.NoError(t, builder.SetLoggerYAML(`level: DEBUG`))
+
+	produce, err := builder.AddProducerFunc()
+	require.NoError(t, err)
+
+	// The schema is deliberately limited to STRING columns with
+	// ingestion-time partitioning: that covers auto-create, partitioning and
+	// clustering while steering clear of the protojson↔INT64 representation
+	// rules that TIMESTAMP/INTEGER columns require.
+	outputYAML := fmt.Sprintf(`
+gcp_bigquery_write_api:
+  project: %s
+  dataset: %s
+  table: %s
+  auto_create_table: true
+  schema:
+    - { name: id, type: STRING, mode: REQUIRED }
+    - { name: payload, type: STRING }
+    - { name: tenant_id, type: STRING }
+  time_partitioning:
+    type: DAY
+  clustering: [tenant_id]
+  endpoint:
+    http: %s
+    grpc: %s
+`, projectID, datasetID, tableID, emu.httpEndpoint, emu.grpcEndpoint)
+	require.NoError(t, builder.AddOutputYAML(outputYAML))
+
+	strm, err := builder.Build()
+	require.NoError(t, err)
+	license.InjectTestService(strm.Resources())
+
+	// Run the stream in the background; a clean exit or a cancelled context
+	// is expected, anything else is reported as a test error.
+	go func() {
+		runErr := strm.Run(t.Context())
+		if runErr == nil || errors.Is(runErr, context.Canceled) {
+			return
+		}
+		t.Errorf("stream error: %v", runErr)
+	}()
+	t.Cleanup(func() {
+		stopErr := strm.StopWithin(10 * time.Second)
+		if stopErr != nil {
+			t.Log(stopErr)
+		}
+	})
+
+	payload := []byte(`{"id":"a","payload":"hello","tenant_id":"t1"}`)
+	require.NoError(t, produce(t.Context(), service.NewMessage(payload)))
+
+	t.Log("Then the table is auto-created with the configured schema")
+	// Poll until a metadata lookup for the table succeeds.
+	tableExists := func() bool {
+		_, lookupErr := tbl.Metadata(t.Context())
+		return lookupErr == nil
+	}
+	require.Eventually(t, tableExists, 10*time.Second, 250*time.Millisecond)
+
+	md, err := tbl.Metadata(t.Context())
+	require.NoError(t, err)
+	require.Len(t, md.Schema, 3)
+	var cols []string
+	for i := range md.Schema {
+		cols = append(cols, md.Schema[i].Name)
+	}
+	assert.ElementsMatch(t, []string{"id", "payload", "tenant_id"}, cols)
+	// The emulator may not report partitioning/clustering metadata at all, so
+	// those checks only run when the metadata is present. That keeps the test
+	// a proof of auto-create without tying it to emulator completeness.
+	if tp := md.TimePartitioning; tp != nil {
+		assert.Equal(t, bigquery.DayPartitioningType, tp.Type)
+		// With ingestion-time partitioning Field stays empty (_PARTITIONTIME is used).
+		assert.Empty(t, tp.Field)
+	}
+	if cl := md.Clustering; cl != nil {
+		assert.Equal(t, []string{"tenant_id"}, cl.Fields)
+	}
+
+	t.Log("And the row lands in the table")
+	// Poll until a full read of the table yields at least one row; a read
+	// error counts as "not yet" and triggers another attempt.
+	rowLanded := func() bool {
+		rows := tbl.Read(t.Context())
+		n := 0
+		for {
+			var row map[string]bigquery.Value
+			nextErr := rows.Next(&row)
+			if errors.Is(nextErr, iterator.Done) {
+				return n >= 1
+			}
+			if nextErr != nil {
+				return false
+			}
+			n++
+		}
+	}
+	require.Eventually(t, rowLanded, 30*time.Second, 500*time.Millisecond)
+}
+
+// TestIntegrationPendingStreamMode is meant to verify that, with
+// write_mode: pending_stream and a batch size of 3, three produced messages
+// end up as exactly three readable rows in a pre-created table.
+//
+// The test is currently skipped unconditionally (see the t.Skip below), so
+// nothing after the skip executes; the body is kept so it can be re-enabled
+// once a backend with pending-stream support is available.
+func TestIntegrationPendingStreamMode(t *testing.T) {
+	integration.CheckSkip(t)
+	// The goccy/bigquery-emulator (used by these integration tests) does not
+	// implement the Pending write-stream type or BatchCommitWriteStreams; the
+	// test hangs waiting for the commit RPC. Skip until the emulator gains
+	// support or these tests can target real BigQuery in a nightly job.
+	t.Skip("goccy bigquery-emulator does not implement Pending streams / BatchCommitWriteStreams")
+
+	const (
+		projectID = "test-project"
+		datasetID = "test_dataset"
+		tableID   = "pending_test"
+	)
+
+	emu := startEmulator(t, projectID, datasetID)
+
+	t.Log("Given a table exists for pending-stream writes")
+	require.NoError(t, emu.bqClient.Dataset(datasetID).Table(tableID).Create(t.Context(), &bigquery.TableMetadata{
+		Schema: bigquery.Schema{
+			{Name: "id", Type: bigquery.StringFieldType, Required: true},
+		},
+	}))
+
+	sb := service.NewStreamBuilder()
+	require.NoError(t, sb.SetLoggerYAML(`level: DEBUG`))
+
+	sendFn, err := sb.AddProducerFunc()
+	require.NoError(t, err)
+
+	// batching.count matches the number of messages sent below, so all rows
+	// are expected to travel in one batch.
+	require.NoError(t, sb.AddOutputYAML(fmt.Sprintf(`
+gcp_bigquery_write_api:
+  project: %s
+  dataset: %s
+  table: %s
+  write_mode: pending_stream
+  batching:
+    count: 3
+  endpoint:
+    http: %s
+    grpc: %s
+`, projectID, datasetID, tableID, emu.httpEndpoint, emu.grpcEndpoint)))
+
+	stream, err := sb.Build()
+	require.NoError(t, err)
+	license.InjectTestService(stream.Resources())
+
+	// Run the stream in the background; only errors other than context
+	// cancellation are reported as test errors.
+	go func() {
+		if err := stream.Run(t.Context()); err != nil && !errors.Is(err, context.Canceled) {
+			t.Errorf("stream error: %v", err)
+		}
+	}()
+	// Log a failed shutdown instead of discarding it, matching the cleanup in
+	// TestIntegrationAutoCreateTable.
+	t.Cleanup(func() {
+		if err := stream.StopWithin(10 * time.Second); err != nil {
+			t.Log(err)
+		}
+	})
+
+	for _, id := range []string{"a", "b", "c"} {
+		require.NoError(t, sendFn(t.Context(), service.NewMessage([]byte(`{"id":"`+id+`"}`))))
+	}
+
+	t.Log("Then all 3 rows are committed atomically and visible")
+	// Poll until a full read of the table yields exactly three rows; a read
+	// error counts as "not yet" and triggers another attempt.
+	require.Eventually(t, func() bool {
+		it := emu.bqClient.Dataset(datasetID).Table(tableID).Read(t.Context())
+		var count int
+		for {
+			var row map[string]bigquery.Value
+			if err := it.Next(&row); errors.Is(err, iterator.Done) {
+				break
+			} else if err != nil {
+				return false
+			}
+			count++
+		}
+		return count == 3
+	}, 30*time.Second, 500*time.Millisecond)
+}
+
 func TestIntegrationTableNameSanitization(t *testing.T) {
 	integration.CheckSkip(t)