fix: Fix streaming materialization for exotic sources with lazy UDF pipelines

ntkathole · ntkathole · commit c07972dd09d2 · 2026-05-03T22:13:29.000+05:30
Signed-off-by: ntkathole <nikhilkathole2683@gmail.com>
diff --git a/sdk/python/feast/infra/common/serde.py b/sdk/python/feast/infra/common/serde.py
@@ -30,7 +30,8 @@ def unserialize(self):
         # unserialize
         proto = FeatureViewProto()
         proto.ParseFromString(self.feature_view_proto)
-        feature_view = FeatureView.from_proto(proto)
+        # skip_udf=True: the write node only needs schema / entity metadata.
+        feature_view = FeatureView.from_proto(proto, skip_udf=True)
 
         # load
         repo_config = dill.loads(self.repo_config_byte)
diff --git a/sdk/python/feast/infra/compute_engines/ray/config.py b/sdk/python/feast/infra/compute_engines/ray/config.py
@@ -41,7 +41,20 @@ class RayComputeEngineConfig(FeastConfigBaseModel):
 
     # Additional configuration options
     max_workers: Optional[int] = None
-    """Maximum number of Ray workers. If None, uses all available cores."""
+    """Maximum number of Ray workers for transformation and join nodes.
+    If None, Ray uses all available cores."""
+
+    write_concurrency: Optional[int] = None
+    """Concurrency for the RayWriteNode's map_batches call (online-store writes).
+    If None, falls back to max_workers, then 1 (safe default
+    for single-file stores).
+
+    Example - SQLite online store (default for local deployments):
+      write_concurrency: 1
+
+    Example - Redis / DynamoDB online store (supports parallel writes):
+      write_concurrency: 8
+    """
 
     enable_optimization: bool = True
     """Enable automatic performance optimizations."""
diff --git a/sdk/python/feast/infra/compute_engines/ray/feature_builder.py b/sdk/python/feast/infra/compute_engines/ray/feature_builder.py
@@ -136,6 +136,7 @@ def build_dedup_node(self, view, input_node):
             name="dedup",
             column_info=column_info,
             config=self.config,
+            is_materialization=self.is_materialization,
         )
         node.add_input(input_node)
 
diff --git a/sdk/python/feast/infra/compute_engines/ray/nodes.py b/sdk/python/feast/infra/compute_engines/ray/nodes.py
@@ -559,17 +559,33 @@ def _fallback_pandas_aggregation(self, dataset: Dataset, agg_dict: dict) -> Data
 class RayDedupNode(DAGNode):
     """
     Ray node for deduplicating records.
+
+    Two dedup strategies are provided:
+
+    * **Materialization** (``is_materialization=True``): per-block
+      ``drop_duplicates``.  This is streaming-friendly because it never needs
+      to see all blocks at once.  Any cross-block duplicates are resolved by
+      the online store, which does an UPSERT and therefore naturally keeps the
+      last-written value.  This avoids the ``groupby().map_groups()`` full
+      shuffle that would otherwise block until every single block was produced.
+
+    * **Historical retrieval** (``is_materialization=False``): global
+      ``groupby().map_groups()``.  Correctness is required here because the
+      entity-timestamp join must return exactly one feature row per
+      (entity, query-timestamp) pair.
     """
 
     def __init__(
         self,
         name: str,
         column_info,
         config: RayComputeEngineConfig,
+        is_materialization: bool = False,
     ):
         super().__init__(name)
         self.column_info = column_info
         self.config = config
+        self.is_materialization = is_materialization
 
     def execute(self, context: ExecutionContext) -> DAGValue:
         """Execute the deduplication operation."""
@@ -581,26 +597,54 @@ def execute(self, context: ExecutionContext) -> DAGValue:
         timestamp_col = self.column_info.timestamp_column
 
         if join_keys:
-            available_join_keys = [k for k in join_keys if k in dataset.schema().names]
-            available_ts_col = (
-                timestamp_col if timestamp_col in dataset.schema().names else None
-            )
-
-            if available_join_keys:
-                # groupby().map_groups() co-locates ALL rows for the same entity
-                # in a single call, so deduplication is always correct regardless
-                # of how Ray splits the dataset into partitions.  sort + map_batches
-                # is NOT safe: Ray can place the same entity's rows in different
-                # partitions after a sort, causing surviving duplicates.
-                def _keep_latest_in_group(group: pd.DataFrame) -> pd.DataFrame:
-                    if available_ts_col and available_ts_col in group.columns:
-                        group = group.sort_values(available_ts_col, ascending=False)
-                    return group.head(1)
-
-                dataset = dataset.groupby(available_join_keys).map_groups(
-                    _keep_latest_in_group, batch_format="pandas"
+            if self.is_materialization:
+                # Per-block dedup: streaming-safe, no full shuffle required.
+                # Cross-block duplicates are handled by the online-store UPSERT.
+                #
+                # IMPORTANT: do NOT call dataset.schema() here.  For streaming
+                # datasets backed by slow map_batches actors, .schema() triggers
+                # eager block execution to
+                # infer the output type.  Those blocks are consumed and LOST —
+                # they never reach the write stage.  We therefore defer the
+                # column-existence check to inside _dedup_block, which runs in
+                # a worker per block without interfering with streaming.
+                _join_keys = list(join_keys)
+                _ts_col = timestamp_col
+
+                def _dedup_block(block: pd.DataFrame) -> pd.DataFrame:
+                    available = [k for k in _join_keys if k in block.columns]
+                    if not available:
+                        return block
+                    if _ts_col and _ts_col in block.columns:
+                        block = block.sort_values(_ts_col, ascending=False)
+                    return block.drop_duplicates(subset=available)
+
+                dataset = dataset.map_batches(_dedup_block, batch_format="pandas")
+            else:
+                # Global dedup via groupby: required for historical retrieval
+                # where the entity–timestamp join must return exactly one row
+                # per (entity, query-timestamp) pair.
+                # NOTE: groupby().map_groups() is a full shuffle and blocks
+                # until ALL upstream blocks are produced.  Use only when
+                # correctness across partition boundaries is mandatory.
+                available_join_keys = [
+                    k for k in join_keys if k in dataset.schema().names
+                ]
+                available_ts_col = (
+                    timestamp_col if timestamp_col in dataset.schema().names else None
                 )
 
+                if available_join_keys:
+
+                    def _keep_latest_in_group(group: pd.DataFrame) -> pd.DataFrame:
+                        if available_ts_col and available_ts_col in group.columns:
+                            group = group.sort_values(available_ts_col, ascending=False)
+                        return group.head(1)
+
+                    dataset = dataset.groupby(available_join_keys).map_groups(
+                        _keep_latest_in_group, batch_format="pandas"
+                    )
+
         deduped_dataset = dataset
 
         return DAGValue(
@@ -848,10 +892,19 @@ def write_batch_with_serialized_artifacts(batch: pd.DataFrame) -> pd.DataFrame:
 
             return batch
 
+        # Resolve write concurrency from config.
+        # write_concurrency takes precedence; falls back to max_workers, then 1.
+        if self.config is not None and self.config.write_concurrency is not None:
+            _write_concurrency = self.config.write_concurrency
+        elif self.config is not None and self.config.max_workers is not None:
+            _write_concurrency = self.config.max_workers
+        else:
+            _write_concurrency = 1
+
         written_dataset = dataset.map_batches(
             write_batch_with_serialized_artifacts,
             batch_format="pandas",
-            concurrency=self.config.max_workers if self.config else 12,
+            concurrency=_write_concurrency,
         )
         written_dataset = written_dataset.materialize()
 
diff --git a/sdk/python/feast/infra/offline_stores/contrib/ray_offline_store/ray.py b/sdk/python/feast/infra/offline_stores/contrib/ray_offline_store/ray.py
@@ -1793,37 +1793,49 @@ def _load_and_filter_dataset_ray(
             if pre_loaded_ds is not None:
                 ds = pre_loaded_ds
 
-                # Normalize the timestamp column BEFORE the filter so that
-                # non-Parquet sources (CSV, JSON, SQL) whose raw dataset may
-                # contain strings or tz-naive datetimes can be compared against
-                # the tz-aware datetime bounds below without raising TypeError.
-                # This mirrors what _create_filtered_dataset does for file-based
-                # sources as part of its read pipeline.
-                if timestamp_field:
-                    ts_cols_to_norm = [timestamp_field]
-                    if created_timestamp_column:
-                        ts_cols_to_norm.append(created_timestamp_column)
-                    ds = ensure_timestamp_compatibility(ds, ts_cols_to_norm)
-
-                # Apply time-range filter inline (done by _create_filtered_dataset
-                # for path-based sources).
-                def _normalize(dt: Optional[datetime]) -> Optional[datetime]:
-                    return make_tzaware(dt) if dt and dt.tzinfo is None else dt
-
-                s_date = _normalize(start_date)
-                e_date = _normalize(end_date)
-                ts_col = timestamp_field
-
-                if s_date and e_date:
-                    ds = ds.filter(
-                        lambda batch, s=s_date, e=e_date, col=ts_col: (
-                            (batch[col] >= s) & (batch[col] <= e)
+                # Normalize timestamps and apply time-range filter inside
+                # map_batches so that ds.schema() is NEVER called eagerly.
+                # Column-existence checks are deferred to each batch so that
+                # exotic sources whose timestamp column is synthesised inside a
+                # downstream UDF (e.g. HuggingFace image datasets) are handled
+                # gracefully: normalization and filtering are simply skipped for
+                # batches that do not yet contain the column.
+                _ts_field = timestamp_field
+                _created_ts = created_timestamp_column
+                _s_date = (
+                    make_tzaware(start_date)
+                    if start_date and start_date.tzinfo is None
+                    else start_date
+                )
+                _e_date = (
+                    make_tzaware(end_date)
+                    if end_date and end_date.tzinfo is None
+                    else end_date
+                )
+
+                def _norm_and_filter(batch: pd.DataFrame) -> pd.DataFrame:
+                    batch = make_df_tzaware(batch)
+                    for col in [
+                        c for c in [_ts_field, _created_ts] if c and c in batch.columns
+                    ]:
+                        batch[col] = (
+                            pd.to_datetime(batch[col], utc=True, errors="coerce")
+                            .dt.floor("s")
+                            .astype("datetime64[ns, UTC]")
                         )
-                    )
-                elif s_date:
-                    ds = ds.filter(lambda batch, s=s_date, col=ts_col: batch[col] >= s)
-                elif e_date:
-                    ds = ds.filter(lambda batch, e=e_date, col=ts_col: batch[col] <= e)
+                    if _ts_field and _ts_field in batch.columns:
+                        if _s_date and _e_date:
+                            batch = batch[
+                                (batch[_ts_field] >= _s_date)
+                                & (batch[_ts_field] <= _e_date)
+                            ]
+                        elif _s_date:
+                            batch = batch[batch[_ts_field] >= _s_date]
+                        elif _e_date:
+                            batch = batch[batch[_ts_field] <= _e_date]
+                    return batch
+
+                ds = ds.map_batches(_norm_and_filter, batch_format="pandas")
             else:
                 if not feature_name_columns:
                     columns_to_read = None
diff --git a/sdk/python/tests/component/ray/test_nodes.py b/sdk/python/tests/component/ray/test_nodes.py
@@ -299,6 +299,146 @@ def test_ray_dedup_node(
     assert "driver_id" in result_df.columns
 
 
+def test_ray_dedup_node_materialization_within_block(
+    ray_session, ray_config, mock_context, column_info
+):
+    """Materialization path: within-block duplicates are removed and the row
+    with the latest event_timestamp is kept.
+
+    is_materialization=True uses per-block map_batches (streaming-safe).
+    No ds.schema() call should be triggered.
+    """
+    now = datetime.now()
+    older_ts = now - timedelta(hours=3)
+    newer_ts = now - timedelta(hours=1)
+
+    block = pd.DataFrame(
+        [
+            {
+                "driver_id": 1001,
+                "event_timestamp": older_ts,
+                "conv_rate": 0.5,
+            },
+            {
+                "driver_id": 1001,
+                "event_timestamp": newer_ts,
+                "conv_rate": 0.8,
+            },
+            {
+                "driver_id": 1002,
+                "event_timestamp": now - timedelta(hours=2),
+                "conv_rate": 0.7,
+            },
+        ]
+    )
+
+    ray_dataset = ray.data.from_pandas(block)
+    input_value = DAGValue(data=ray_dataset, format=DAGFormat.RAY)
+    dummy_node = DummyInputNode("input_node", input_value)
+    node = RayDedupNode(
+        name="dedup",
+        column_info=column_info,
+        config=ray_config,
+        is_materialization=True,
+    )
+    node.add_input(dummy_node)
+    mock_context.node_outputs = {"input_node": input_value}
+
+    result = node.execute(mock_context)
+    result_df = result.data.to_pandas().sort_values("driver_id").reset_index(drop=True)
+
+    assert len(result_df) == 2, "One row per entity should survive within the block"
+    driver_1001 = result_df[result_df["driver_id"] == 1001].iloc[0]
+    assert driver_1001["event_timestamp"] == newer_ts, (
+        "Latest timestamp should be kept for driver 1001"
+    )
+
+
+def test_ray_dedup_node_materialization_cross_block_duplicates_survive(
+    ray_session, ray_config, mock_context, column_info
+):
+    """Materialization path: the same entity in two *different* blocks both
+    survive — cross-block dedup is delegated to the online-store UPSERT.
+
+    This validates the per-block (streaming-safe) semantics: a global shuffle
+    is intentionally avoided so that slow upstream actors (EasyOCR, CLIP, etc.)
+    do not need to finish all blocks before writes begin.
+    """
+    now = datetime.now()
+    block_a = pd.DataFrame(
+        [
+            {
+                "driver_id": 1001,
+                "event_timestamp": now - timedelta(hours=3),
+                "conv_rate": 0.5,
+            }
+        ]
+    )
+    block_b = pd.DataFrame(
+        [
+            {
+                "driver_id": 1001,
+                "event_timestamp": now - timedelta(hours=1),
+                "conv_rate": 0.8,
+            }
+        ]
+    )
+
+    # Force two separate Ray blocks by passing a list of DataFrames.
+    ray_dataset = ray.data.from_pandas([block_a, block_b])
+    input_value = DAGValue(data=ray_dataset, format=DAGFormat.RAY)
+    dummy_node = DummyInputNode("input_node", input_value)
+    node = RayDedupNode(
+        name="dedup",
+        column_info=column_info,
+        config=ray_config,
+        is_materialization=True,
+    )
+    node.add_input(dummy_node)
+    mock_context.node_outputs = {"input_node": input_value}
+
+    result = node.execute(mock_context)
+    result_df = result.data.to_pandas()
+
+    assert len(result_df) == 2, (
+        "Both blocks should each contribute one row; "
+        "cross-block dedup is the online store's responsibility"
+    )
+
+
+def test_ray_dedup_node_materialization_no_join_keys(
+    ray_session, ray_config, mock_context, sample_data
+):
+    """Materialization path: when no join keys are present all rows pass through
+    unchanged (there is nothing to deduplicate on).
+    """
+    empty_column_info = ColumnInfo(
+        join_keys=[],
+        feature_cols=["conv_rate", "acc_rate", "avg_daily_trips"],
+        ts_col="event_timestamp",
+        created_ts_col="created",
+        field_mapping=None,
+    )
+    ray_dataset = ray.data.from_pandas(sample_data)
+    input_value = DAGValue(data=ray_dataset, format=DAGFormat.RAY)
+    dummy_node = DummyInputNode("input_node", input_value)
+    node = RayDedupNode(
+        name="dedup",
+        column_info=empty_column_info,
+        config=ray_config,
+        is_materialization=True,
+    )
+    node.add_input(dummy_node)
+    mock_context.node_outputs = {"input_node": input_value}
+
+    result = node.execute(mock_context)
+    result_df = result.data.to_pandas()
+
+    assert len(result_df) == len(sample_data), (
+        "All rows should survive when there are no join keys to deduplicate on"
+    )
+
+
 def test_ray_config_validation():
     """Test Ray configuration validation."""
     # Test valid configuration

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -136,6 +136,7 @@ def build_dedup_node(self, view, input_node):` |
| `136` | `136` | `name="dedup",` |
| `137` | `137` | `column_info=column_info,` |
| `138` | `138` | `config=self.config,` |
| | `139` | `+ is_materialization=self.is_materialization,` |
| `139` | `140` | `)` |
| `140` | `141` | `node.add_input(input_node)` |
| `141` | `142` | |