Multi-file CDC snapshot support (#14)

PremRajendran · web-flow · commit 5cfae0a204f1 · 2026-02-18T12:14:56.000+11:00
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-v0.4.1
+v0.5.0
diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_multifile_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_multifile_main.json
@@ -0,0 +1,34 @@
+{
+    "dataFlowId": "feature_historical_files_snapshot_datetime_multifile",
+    "dataFlowGroup": "feature_samples_snapshots",
+    "dataFlowType": "standard",
+    "targetFormat": "delta",
+    "targetDetails": {
+        "table": "feature_historical_snapshot_files_datetime_multifile",
+        "tableProperties": {
+            "delta.enableChangeDataFeed": "true"
+        },
+        "schemaPath": "target/customer_schema.json",
+        "configFlags": ["disableOperationalMetadata"]
+    },
+    "cdcSnapshotSettings": {
+        "keys": [
+            "CUSTOMER_ID"
+        ],
+        "scd_type": "2",
+        "snapshotType": "historical",
+        "sourceType": "file",
+        "source": {
+            "format": "csv",
+            "path": "{sample_file_location}/snapshot_customer_multifile/customer_{version}_split_{fragment}.csv",
+            "readerOptions": {
+                "header": "true"
+            },
+            "versionType": "timestamp",
+            "datetimeFormat": "%Y_%m_%d"
+        },
+        "track_history_except_column_list":[
+            "LOAD_TIMESTAMP"
+        ]
+    }
+}
diff --git a/samples/test_data_and_orchestrator/src/initialize.ipynb b/samples/test_data_and_orchestrator/src/initialize.ipynb
@@ -40,6 +40,7 @@
     "volume_root_file_path = f\"/Volumes/{staging_schema}/{staging_volume}\".replace(\".\", \"/\")\n",
     "customer_file_path = f\"{volume_root_file_path}/customer\"\n",
     "customer_snapshot_file_path = f\"{volume_root_file_path}/snapshot_customer\"\n",
+    "customer_snapshot_multifile_path = f\"{volume_root_file_path}/snapshot_customer_multifile\"\n",
     "customer_snapshot_partitioned_file_path = f\"{volume_root_file_path}/snapshot_customer_partitioned\"\n",
     "customer_snapshot_partitioned_parquet_file_path = f\"{volume_root_file_path}/snapshot_customer_partitioned_parquet\"\n",
     "template_samples_base_file_path = f\"{volume_root_file_path}/template_samples\"\n",
diff --git a/samples/test_data_and_orchestrator/src/run_1_staging_load.ipynb b/samples/test_data_and_orchestrator/src/run_1_staging_load.ipynb
@@ -277,6 +277,34 @@
     "dbutils.fs.put(\n",
     "  f\"{template_samples_customer_file_path}/customer_2024_02_10.csv\",\n",
     "  file_content,\n",
+    "  True)\n",
+    "\n",
+    "\n",
+    "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n",
+    "1,John,Doe,john.doe@example.com,,2024-01-01 10:00:00\\n\n",
+    "\"\"\"\n",
+    "\n",
+    "dbutils.fs.put(\n",
+    "  f\"{customer_snapshot_multifile_path}/customer_2024_01_01_split_0001.csv\",\n",
+    "  file_content,\n",
+    "  True)\n",
+    "\n",
+    "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n",
+    "2,Jane,Smith,jane.smith@example.com,,2024-01-01 10:00:00\\n\n",
+    "\"\"\"\n",
+    "\n",
+    "dbutils.fs.put(\n",
+    "  f\"{customer_snapshot_multifile_path}/customer_2024_01_01_split_0002.csv\",\n",
+    "  file_content,\n",
+    "  True)\n",
+    "\n",
+    "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n",
+    "1,John,Doe,john@example.com,,2024-12-12 10:00:00\\n\n",
+    "\"\"\"\n",
+    "\n",
+    "dbutils.fs.put(\n",
+    "  f\"{customer_snapshot_multifile_path}/customer_2024_12_12_split_0001.csv\",\n",
+    "  file_content,\n",
     "  True)\n"
    ]
   },
diff --git a/src/dataflow/cdc_snapshot.py b/src/dataflow/cdc_snapshot.py
@@ -1,6 +1,8 @@
 import bisect
 from dataclasses import dataclass, field
 from datetime import datetime
+import fnmatch
+import os
 import re
 from typing import Dict, List, Optional, Union
 
@@ -423,7 +425,7 @@ def _get_available_table_versions(self, latest_snapshot_version: Optional[Union[
 
     def _extract_version_from_filename(self, filename: str, file_pattern: str) -> Optional[VersionInfo]:
         """Extract version from filename using pattern"""
-        regex_pattern = re.escape(file_pattern).replace(r'\{version\}', r'(.+)')
+        regex_pattern = re.escape(file_pattern).replace(r'\{version\}', r'(.+?)').replace(r'\{fragment\}', r'.*?')
         match = re.match(regex_pattern, filename)
         if not match or not match.group(1):
             self.logger.debug(f"CDC Snapshot: No version string match found for filename: {filename}")
@@ -481,18 +483,35 @@ def _read_snapshot_dataframe(self, version_info: VersionInfo, dataflow_config: D
 
         if self.sourceType == CDCSnapshotSourceTypes.FILE:
             file_path = self.source.path.replace("{version}", version_info.formatted_value)
-            self.logger.debug(f"CDC Snapshot: Reading file: {file_path}")
-
-            schema_path = self.source.schemaPath
-            select_exp = self.source.selectExp
-
-            df = SourceBatchFiles(
-                path=file_path,
-                format=self.source.format,
-                readerOptions=self.source.readerOptions,
-                schemaPath=schema_path,
-                selectExp=select_exp
-            ).read_source(read_config)
+            
+            if '{fragment}' in file_path:
+                search_pattern = file_path.replace('{fragment}', "*")
+                directory = os.path.dirname(search_pattern)
+                filename_pattern = os.path.basename(search_pattern)
+                dbutils = pipeline_config.get_dbutils()
+                files = [f.path for f in dbutils.fs.ls(directory) if fnmatch.fnmatch(f.name, filename_pattern)]
+            else:
+                files = [file_path]
+            
+            df = None
+            for file in files:
+                self.logger.debug(f"CDC Snapshot: Reading file: {file_path}")
+
+                schema_path = self.source.schemaPath
+                select_exp = self.source.selectExp
+
+                file_df = SourceBatchFiles(
+                    path=file,
+                    format=self.source.format,
+                    readerOptions=self.source.readerOptions,
+                    schemaPath=schema_path,
+                    selectExp=select_exp
+                ).read_source(read_config)
+
+                if df:
+                    df = df.union(file_df)
+                else:
+                    df = file_df
 
             # Apply filter if specified
             if self.source.filter: