
Commit fec2a10

Authored by Manisha4
fix: Add retry with exponential backoff for online store writes (streaming) (#347)
* fix(streaming): add retry with exponential backoff for online store writes
* fix linting error
* fix(build): pin setuptools_scm<10 to avoid vcs_versioning dependency
* addressing PR comments, making timeout check more explicit and adding retry unit tests
* fixing linting
* fixing linting

Co-authored-by: Manisha4 <Manisha4@github.com>
1 parent a0596d8 · commit fec2a10

5 files changed: 390 additions & 6 deletions
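For orientation before the diff: with the defaults introduced below (max_retries=3, base_delay=1.0, max_delay=30.0), the backoff schedule works out to roughly 1 s, 2 s, and 4 s between attempts, each padded with up to 10% random jitter. A minimal sketch of the schedule computation:

# Sketch of the delay schedule produced by the new retry defaults
# (base_delay=1.0, max_delay=30.0, max_retries=3); random jitter adds
# up to 10% of the delay on top of each value.
for attempt in range(3):
    delay = min(1.0 * (2**attempt), 30.0)
    print(f"retry {attempt + 1}: {delay:.0f}s (+ up to {delay * 0.1:.1f}s jitter)")
# retry 1: 1s (+ up to 0.1s jitter)
# retry 2: 2s (+ up to 0.2s jitter)
# retry 3: 4s (+ up to 0.4s jitter)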


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -246,7 +246,7 @@ requires = [
   "pybindgen==0.22.0",
   # https://amitylearning.vercel.app/?question=git-1742419854670&update=1742342400027
   "setuptools>=60,<81",
-  "setuptools_scm>=6.2",
+  "setuptools_scm>=6.2,<10",
   "sphinx!=4.0.0",
   "wheel",
 ]
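The pin keeps build environments on the setuptools_scm 9.x line or older. A quick, illustrative way to confirm what an environment actually resolved (this check is not part of the commit):

# Illustrative sanity check, not part of the commit: verify the resolved
# setuptools_scm release satisfies the new "<10" constraint.
from importlib.metadata import version

major = int(version("setuptools-scm").split(".")[0])
assert major < 10, f"unexpected setuptools_scm version: {version('setuptools-scm')}"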

sdk/python/feast/infra/contrib/spark_kafka_processor.py

Lines changed: 118 additions & 4 deletions
@@ -1,6 +1,9 @@
+import logging
 import os
+import random
+import time
 from types import MethodType
-from typing import List, Optional, Set, Union, no_type_check
+from typing import Callable, List, Optional, Set, Union, no_type_check
 
 import pandas as pd
 from pyspark import SparkContext
@@ -23,6 +26,89 @@
 from feast.sorted_feature_view import SortedFeatureView
 from feast.stream_feature_view import StreamFeatureView
 
+logger = logging.getLogger(__name__)
+
+# Patterns that indicate transient errors which should be retried
+TRANSIENT_ERROR_PATTERNS = [
+    "writetimeout",
+    "readtimeout",
+    "unavailable",
+    "operationtimedout",
+    "nohostsavailable",
+    "connection refused",
+    "connection reset",
+    "overloaded",
+]
+
+
+def _is_transient_error(exc: Exception) -> bool:
+    """Check if an exception is a transient error that should be retried."""
+    exc_str = str(exc).lower()
+    exc_type = type(exc).__name__.lower()
+
+    for pattern in TRANSIENT_ERROR_PATTERNS:
+        if pattern in exc_str or pattern in exc_type:
+            return True
+    return False
+
+
+def _write_with_retry(
+    write_fn: Callable[[], None],
+    operation_name: str,
+    max_retries: int = 3,
+    base_delay: float = 1.0,
+    max_delay: float = 30.0,
+) -> None:
+    """
+    Execute a write function with exponential backoff retry for transient errors.
+
+    Args:
+        write_fn: The write function to execute
+        operation_name: Name of the operation for logging
+        max_retries: Maximum number of retry attempts
+        base_delay: Base delay in seconds for exponential backoff
+        max_delay: Maximum delay in seconds between retries
+
+    Raises:
+        Exception: The last exception if all retries are exhausted or if a
+            non-transient error occurs
+    """
+    for attempt in range(max_retries + 1):
+        try:
+            write_fn()
+            if attempt > 0:
+                logger.info(
+                    f"[{operation_name}] Succeeded after {attempt} retry attempt(s)"
+                )
+            return  # Success
+        except Exception as e:
+            if not _is_transient_error(e):
+                # Permanent error - don't retry, bubble up immediately
+                logger.error(
+                    f"[{operation_name}] Permanent error (not retrying): "
+                    f"{type(e).__name__}: {e}"
+                )
+                raise
+
+            if attempt < max_retries:
+                # Calculate delay with exponential backoff + jitter
+                delay = min(base_delay * (2**attempt), max_delay)
+                jitter = random.uniform(0, delay * 0.1)
+                total_delay = delay + jitter
+
+                logger.warning(
+                    f"[{operation_name}] Transient error, retry {attempt + 1}/{max_retries} "
+                    f"after {total_delay:.2f}s: {type(e).__name__}: {e}"
+                )
+                time.sleep(total_delay)
+            else:
+                # Max retries exceeded - bubble up the exception
+                logger.error(
+                    f"[{operation_name}] Max retries ({max_retries}) exceeded: "
+                    f"{type(e).__name__}: {e}"
+                )
+                raise
+
 
 class SparkProcessorConfig(ProcessorConfig):
     """spark_kafka_options, schema_registry_config and checkpoint_location are only used for ConfluentAvroFormat"""
@@ -279,11 +365,28 @@ def batch_write(row: DataFrame, batch_id: int):
                 rows = self.preprocess_fn(rows)
 
             # Finally persist the data to the online store and/or offline store.
+            # Use retry with exponential backoff for transient errors.
             if rows.size > 0:
                 if to == PushMode.ONLINE or to == PushMode.ONLINE_AND_OFFLINE:
-                    self.fs.write_to_online_store(self.sfv.name, rows)
+                    _write_with_retry(
+                        write_fn=lambda: self.fs.write_to_online_store(
+                            self.sfv.name, rows
+                        ),
+                        operation_name=f"write_to_online_store[{self.sfv.name}][batch_id={batch_id}]",
+                        max_retries=3,
+                        base_delay=1.0,
+                        max_delay=30.0,
+                    )
                 if to == PushMode.OFFLINE or to == PushMode.ONLINE_AND_OFFLINE:
-                    self.fs.write_to_offline_store(self.sfv.name, rows)
+                    _write_with_retry(
+                        write_fn=lambda: self.fs.write_to_offline_store(
+                            self.sfv.name, rows
+                        ),
+                        operation_name=f"write_to_offline_store[{self.sfv.name}][batch_id={batch_id}]",
+                        max_retries=3,
+                        base_delay=1.0,
+                        max_delay=30.0,
+                    )
 
         query = (
             df.writeStream.outputMode("update")
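A side note on the wiring above: _write_with_retry takes a zero-argument callable, so each write is wrapped in a lambda that closes over self.sfv.name and rows and can be re-invoked on every attempt. An equivalent, slightly more explicit spelling (a sketch shown out of context; store, name, and rows stand in for self.fs, self.sfv.name, and the micro-batch DataFrame):

# Sketch: functools.partial is an equivalent way to bind the write
# arguments once so the same call can be repeated on retry.
from functools import partial

write_fn = partial(store.write_to_online_store, name, rows)
_write_with_retry(write_fn=write_fn, operation_name=f"write_to_online_store[{name}]")

Worst case, with the defaults a micro-batch now blocks for about 1 + 2 + 4 s of backoff (plus up to 10% jitter) before the exception propagates out of batch_write and fails the streaming query.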
@@ -293,5 +396,16 @@ def batch_write(row: DataFrame, batch_id: int):
             .start()
         )
 
-        query.awaitTermination(timeout=self.query_timeout)
+        terminated = query.awaitTermination(timeout=self.query_timeout)
+
+        if terminated:
+            # Query terminated before timeout - check if it was an error
+            # This ensures exceptions from batch_write() bubble up to the caller
+            query_exception = query.exception()
+            if query_exception is not None:
+                logger.error(
+                    f"Streaming query terminated with exception: {query_exception}"
+                )
+                raise query_exception
+
         return query
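For context on the new termination check (not part of the diff): in PySpark, StreamingQuery.awaitTermination(timeout) returns True if the query stopped within the timeout and False if it is still running, and StreamingQuery.exception() returns the StreamingQueryException that stopped the query, or None. A hedged caller-side sketch of what this enables, where processor and the surrounding names are illustrative:

# Hypothetical caller-side sketch: with the check above, a micro-batch
# failure that exhausted its retries now propagates out of ingestion
# instead of being lost when awaitTermination merely returned.
try:
    query = processor.ingest_stream_feature_view()  # assumed entry point
except Exception as e:
    # e is the StreamingQueryException re-raised via query.exception()
    logger.error(f"stream ingestion failed: {e}")
    raise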

sdk/python/tests/unit/infra/contrib/__init__.py

Whitespace-only changes.
