feat(cdc): File path regex support (#23)

PremRajendran · web-flow · commit 5d57934b8dda · 2026-03-20T14:12:30.000+11:00
diff --git a/docs/source/dataflow_spec_ref_cdc.rst b/docs/source/dataflow_spec_ref_cdc.rst
@@ -112,7 +112,7 @@ CDC Historical Snapshot Source Configuration
        - The format of the source data. E.g. supported formats are ``table``, ``parquet``, ``csv``, ``json``. All formats supported by spark see `PySpark Data Sources API <https://spark.apache.org/docs/3.5.3/sql-data-sources.html>`_.
      * - **path**
        - ``string``
-       - The location to load the source data from. This can be a table name or a path to a a file or directory with multiple snapshots. A placeholder ``{version}`` can be used in this path which will be substituted with the version value in run time.
+       - The location to load the source data from. This can be a table name or a path to a file or directory with multiple snapshots. Supports three path pattern styles for version extraction: the ``{version}`` placeholder (simple single-segment match), the ``{fragment}`` placeholder (for multi-file snapshots), and regex named capture groups (for complex partitioning). See :ref:`file-path-patterns` for details and examples.
      * - **versionType**
        - ``string``
        - The type of versioning to use. Can be either ``int`` or ``datetime``.
@@ -138,14 +138,102 @@ CDC Historical Snapshot Source Configuration
        - (*optional*) A list of select expressions to apply to the source data.
      * - **filter**
        - ``string``
-       - (*optional*) A filter expression to apply to the source data. This filter is applied to the dataframe as a WHERE clause when the source is read. A placeholder ``{version}`` can be used in this filter expression which will be substituted with the version value in run time.
+       - (*optional*) A filter expression to apply to the source data. This filter is applied to the dataframe as a WHERE clause when the source is read. The placeholder ``{version}`` can be used in this filter expression and will be substituted with the version value at run time (e.g. ``"year = '{version}'"``). Not applicable when using regex named capture groups in ``path``.
      * - **recursiveFileLookup**
        - ``boolean``
        - (*optional*) When set to ``true``, enables recursive directory traversal to find snapshot files. This should be used when snapshots are stored in a nested directory structure such as Hive-style partitioning (e.g., ``/data/{version}/file.parquet``). When set to ``false`` (default), only files in the immediate directory are searched. Default: ``false``.
 
 
   .. note::
-    If ``recursiveFileLookup`` is set to ``true``, ensure that the ``path`` parameter is specified in a way that is compatible with recursive directory traversal. I.e. the ``{version}`` placeholder is used in the path and not the filename.
+    If ``recursiveFileLookup`` is set to ``true``, ensure that the ``path`` parameter is compatible with recursive directory traversal. When using the ``{version}`` placeholder, place it in the directory portion of the path rather than the filename (e.g. ``/data/{version}/file.parquet``). When using regex named capture groups, the pattern spans the full relative path from the first dynamic segment, so ``recursiveFileLookup`` must be ``true`` if the version spans multiple directory levels.
+
+.. _file-path-patterns:
+
+File Path Patterns
+^^^^^^^^^^^^^^^^^^
+
+  The ``path`` field supports three styles for expressing where the version (and optional fragment) appears in the file path. All styles can be combined with a static base path prefix that is resolved at run time (e.g. ``{sample_file_location}``).
+
+  .. list-table::
+     :header-rows: 1
+     :widths: 20 35 45
+
+     * - Style
+       - Syntax
+       - When to Use
+     * - ``{version}`` placeholder
+       - ``{version}``
+       - Version is contained in a single path segment or filename component. Simple and readable for flat or single-level partitioned layouts.
+     * - ``{fragment}`` placeholder
+       - ``{fragment}``
+       - Snapshot data for a single version is split across multiple files. Use alongside ``{version}`` to group files sharing the same version together.
+     * - Regex named capture groups
+       - ``(?P<version_<name>>.+)``
+       - Version is spread across multiple path segments or interleaved with other text. Supports complex partitioning schemes (e.g. Hive-style ``YEAR=.../MONTH=.../DAY=...``) where the version cannot be expressed as a single placeholder.
+
+  **{version} — single-segment version**
+
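+  The ``{version}`` placeholder marks one contiguous portion of the path (a filename component or a directory portion) that holds the version. It is internally converted to a regex named capture group ``(?P<version_main>.+)``; because ``.+`` also matches ``/``, the captured value can span several directory levels, as in the directory-partitioned example below.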
+
+  .. code-block:: json
+
+     {
+       "path": "/mnt/data/snapshots/customer_{version}.csv",
+       "versionType": "timestamp",
+       "datetimeFormat": "%Y_%m_%d"
+     }
+
+  Files matched: ``customer_2024_01_01.csv``, ``customer_2024_01_02.csv``, …
+
+  For directory-partitioned layouts, place ``{version}`` in the directory portion and set ``recursiveFileLookup`` to ``true``:
+
+  .. code-block:: json
+
+     {
+       "path": "/mnt/data/snapshots/{version}/customer.csv",
+       "versionType": "timestamp",
+       "datetimeFormat": "YEAR=%Y/MONTH=%m/DAY=%d",
+       "recursiveFileLookup": true
+     }
+
+  Files matched: ``YEAR=2024/MONTH=01/DAY=01/customer.csv``, …
+
+  **{fragment} — multi-file snapshots**
+
+  Use ``{fragment}`` alongside ``{version}`` when a single snapshot version is split across multiple files. All files sharing the same version are read and unioned together before CDC processing.
+
+  .. code-block:: json
+
+     {
+       "path": "/mnt/data/snapshots/customer_{version}_split_{fragment}.csv",
+       "versionType": "timestamp",
+       "datetimeFormat": "%Y_%m_%d"
+     }
+
+  Files matched and grouped by version: ``customer_2024_01_01_split_1.csv``, ``customer_2024_01_01_split_2.csv`` → both ingested as version ``2024-01-01``.
+
+  **Regex named capture groups — multi-segment versions**
+
+  For cases where the version is distributed across multiple directory levels or interleaved with fixed text, use Python regex named capture groups with the prefix ``version_``. All groups whose names start with ``version_`` are extracted and concatenated **in the order they appear in the pattern** (left to right) to form the final version string, which is then parsed according to ``datetimeFormat`` or treated as an integer.
+
+  Group naming convention: ``(?P<version_<name>>.+)``. The ``<name>`` suffix is arbitrary but must be unique within the pattern. The concatenation order is determined by the position of each group in the path expression, not the name.
+
+  .. code-block:: json
+
+     {
+       "path": "/mnt/data/snapshots/(?P<version_year>.+)/(?P<version_month>.+)/data/customer_(?P<version_day>.+).csv",
+       "versionType": "timestamp",
+       "datetimeFormat": "%Y%m%d",
+       "recursiveFileLookup": true
+     }
+
+  For the file ``2024/01/data/customer_15.csv``, the groups are captured left-to-right: ``version_year=2024``, ``version_month=01``, ``version_day=15``. These are concatenated in pattern order to produce ``"20240115"``, which is then parsed with ``datetimeFormat: "%Y%m%d"``.
+
+  .. tip::
+
+     Arrange your ``(?P<version_...>)`` groups in the path from left to right in the same order that their values should be concatenated to match your ``datetimeFormat``. Apart from the required ``version_`` prefix, the group names only need to be unique — the order of the groups in the pattern, not their names, controls concatenation.
+
+  See ``samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_regex_main.json`` for a complete working example.
 
   The ``source`` object contains the following properties for ``table`` based sources:
 
diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_regex_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_regex_main.json
@@ -0,0 +1,35 @@
+{
+    "dataFlowId": "feature_historical_files_snapshot_datetime_recursive_and_regex",
+    "dataFlowGroup": "feature_samples_snapshots",
+    "dataFlowType": "standard",
+    "targetFormat": "delta",
+    "targetDetails": {
+        "table": "feature_historical_snapshot_files_datetime_recursive_and_regex",
+        "tableProperties": {
+            "delta.enableChangeDataFeed": "true"
+        },
+        "schemaPath": "target/customer_schema.json",
+        "configFlags": ["disableOperationalMetadata"]
+    },
+    "cdcSnapshotSettings": {
+        "keys": [
+            "CUSTOMER_ID"
+        ],
+        "scd_type": "2",
+        "snapshotType": "historical",
+        "sourceType": "file",
+        "source": {
+            "format": "csv",
+            "path": "{sample_file_location}/snapshot_customer_regex/(?P<version_year>.+)/(?P<version_month>.+)/data/customer_(?P<version_day>.+).csv",
+            "readerOptions": {
+                "header": "true"
+            },
+            "versionType": "timestamp",
+            "datetimeFormat": "%Y%m%d",
+            "recursiveFileLookup": true
+        },
+        "track_history_except_column_list":[
+            "LOAD_TIMESTAMP"
+        ]
+    }
+}
diff --git a/samples/test_data_and_orchestrator/src/initialize.ipynb b/samples/test_data_and_orchestrator/src/initialize.ipynb
@@ -41,6 +41,7 @@
     "customer_file_path = f\"{volume_root_file_path}/customer\"\n",
     "customer_snapshot_file_path = f\"{volume_root_file_path}/snapshot_customer\"\n",
     "customer_snapshot_multifile_path = f\"{volume_root_file_path}/snapshot_customer_multifile\"\n",
+    "customer_snapshot_regex_file_path = f\"{volume_root_file_path}/snapshot_customer_regex\"\n",
     "customer_snapshot_partitioned_file_path = f\"{volume_root_file_path}/snapshot_customer_partitioned\"\n",
     "customer_snapshot_partitioned_parquet_file_path = f\"{volume_root_file_path}/snapshot_customer_partitioned_parquet\"\n",
     "template_samples_base_file_path = f\"{volume_root_file_path}/template_samples\"\n",
diff --git a/samples/test_data_and_orchestrator/src/run_1_staging_load.ipynb b/samples/test_data_and_orchestrator/src/run_1_staging_load.ipynb
@@ -193,6 +193,7 @@
    "source": [
     "# Delete all files and directories in snapshot directories\n",
     "dbutils.fs.rm(customer_snapshot_file_path, True)\n",
+    "dbutils.fs.rm(customer_snapshot_regex_file_path, True)\n",
     "dbutils.fs.rm(customer_snapshot_partitioned_file_path, True)\n",
     "dbutils.fs.rm(customer_snapshot_partitioned_parquet_file_path, True)\n",
     "dbutils.fs.rm(template_samples_customer_file_path, True)\n",
@@ -219,6 +220,11 @@
     "  True)\n",
     "\n",
     "dbutils.fs.put(\n",
+    "  f\"{customer_snapshot_regex_file_path}/2024/01/data/customer_01.csv\",\n",
+    "  file_content,\n",
+    "  True)\n",
+    "\n",
+    "dbutils.fs.put(\n",
     "  f\"{customer_snapshot_partitioned_file_path}/YEAR=2024/MONTH=01/DAY=01/customer.csv\",\n",
     "  file_content,\n",
     "  True)\n",
diff --git a/src/dataflow/cdc_snapshot.py b/src/dataflow/cdc_snapshot.py