2121import org .apache .fluss .client .ConnectionFactory ;
2222import org .apache .fluss .client .admin .Admin ;
2323import org .apache .fluss .client .initializer .OffsetsInitializer ;
24+ import org .apache .fluss .client .initializer .SnapshotOffsetsInitializer ;
2425import org .apache .fluss .config .ConfigOptions ;
2526import org .apache .fluss .config .Configuration ;
2627import org .apache .fluss .flink .FlinkConnectorOptions ;
2728import org .apache .fluss .flink .source .deserializer .FlussDeserializationSchema ;
29+ import org .apache .fluss .flink .utils .LakeSourceUtils ;
30+ import org .apache .fluss .lake .source .LakeSource ;
31+ import org .apache .fluss .lake .source .LakeSplit ;
2832import org .apache .fluss .metadata .TableInfo ;
2933import org .apache .fluss .metadata .TablePath ;
3034import org .apache .fluss .predicate .Predicate ;
3337import org .slf4j .Logger ;
3438import org .slf4j .LoggerFactory ;
3539
40+ import java .util .Collections ;
3641import java .util .HashMap ;
3742import java .util .List ;
3843import java .util .Map ;
6065 * .build();
6166 * }</pre>
6267 *
68+ * <p>When the target table has datalake enabled and the source starts in full mode (the default,
69+ * {@link OffsetsInitializer#full()}), the built source performs a union read: it reads the
70+ * historical data tiered to the lake (e.g. Iceberg, Paimon) together with the real-time data still
71+ * in Fluss. Other startup modes (earliest/latest/timestamp) read data from Fluss only.
72+ *
6373 * @param <OUT> The type of records produced by the source being built
6474 */
6575public class FlussSourceBuilder <OUT > {
@@ -73,6 +83,7 @@ public class FlussSourceBuilder<OUT> {
7383 private Long scanPartitionDiscoveryIntervalMs ;
7484 private Integer splitPerAssignmentBatchSize ;
7585 private OffsetsInitializer offsetsInitializer ;
86+ private boolean bounded ;
7687 private FlussDeserializationSchema <OUT > deserializationSchema ;
7788
7889 private String bootstrapServers ;
@@ -161,6 +172,19 @@ public FlussSourceBuilder<OUT> setStartingOffsets(OffsetsInitializer offsetsInit
161172 return this ;
162173 }
163174
175+ /**
176+ * Marks the source as bounded (batch): it reads up to the latest offsets at job startup and then
177+ * finishes. {@link #build()} accepts this only for a datalake-enabled table started in full mode
178+ * ({@link OffsetsInitializer#full()}), giving a bounded union read of the lake snapshot and the
179+ * Fluss log, and throws {@link IllegalArgumentException} otherwise. Default: unbounded (streaming).
180+ *
181+ * @return this builder
182+ */
183+ public FlussSourceBuilder <OUT > setBounded () {
184+ this .bounded = true ; // read by build(), which validates it and passes !bounded to FlussSource
185+ return this ;
186+ }
187+
164188 /**
165189 * Sets the deserialization schema for converting Fluss records to output records.
166190 *
@@ -324,6 +348,40 @@ public FlussSource<OUT> build() {
324348 ? tableInfo .getRowType ().project (projectedFields )
325349 : tableInfo .getRowType ();
326350
351+ // union read (lake historical + Fluss) only applies to full startup mode, like the SQL
352+ // connector; other startup modes read Fluss only.
353+ boolean lakeEnabled = tableInfo .getTableConfig ().isDataLakeEnabled ();
354+ boolean fullStartup = offsetsInitializer instanceof SnapshotOffsetsInitializer ;
355+
356+ if (bounded && !(lakeEnabled && fullStartup )) {
357+ throw new IllegalArgumentException (
358+ String .format (
359+ "Bounded (batch) read requires a datalake-enabled table started in "
360+ + "full mode (OffsetsInitializer.full()), but table '%s' has "
361+ + "datalake enabled=%s and full startup mode=%s." ,
362+ tablePath , lakeEnabled , fullStartup ));
363+ }
364+
365+ LakeSource <LakeSplit > lakeSource = null ;
366+ if (lakeEnabled && fullStartup ) {
367+ lakeSource =
368+ LakeSourceUtils .createLakeSource (tablePath , tableInfo .getProperties ().toMap ());
369+ if (lakeSource != null ) {
370+ if (projectedFields != null ) {
371+ int [][] nestedProjectedFields = new int [projectedFields .length ][];
372+ for (int i = 0 ; i < projectedFields .length ; i ++) {
373+ nestedProjectedFields [i ] = new int [] {projectedFields [i ]};
374+ }
375+ lakeSource .withProject (nestedProjectedFields );
376+ }
377+ // push the record-batch filter to the lake side as well,
378+ // so the historical lake scan is filtered consistently with Fluss.
379+ if (logRecordBatchFilter != null ) {
380+ lakeSource .withFilters (Collections .singletonList (logRecordBatchFilter ));
381+ }
382+ }
383+ }
384+
327385 LOG .info ("Creating Fluss Source with Configuration: {}" , flussConf );
328386
329387 return new FlussSource <>(
@@ -338,6 +396,7 @@ public FlussSource<OUT> build() {
338396 scanPartitionDiscoveryIntervalMs ,
339397 splitPerAssignmentBatchSize ,
340398 deserializationSchema ,
341- true );
399+ !bounded ,
400+ lakeSource );
342401 }
343402}
0 commit comments