|
1 | 1 | import bisect |
2 | 2 | from dataclasses import dataclass, field |
3 | 3 | from datetime import datetime |
| 4 | +import fnmatch |
| 5 | +import os |
4 | 6 | import re |
5 | 7 | from typing import Dict, List, Optional, Union |
6 | 8 |
|
@@ -423,7 +425,7 @@ def _get_available_table_versions(self, latest_snapshot_version: Optional[Union[ |
423 | 425 |
|
424 | 426 | def _extract_version_from_filename(self, filename: str, file_pattern: str) -> Optional[VersionInfo]: |
425 | 427 | """Extract version from filename using pattern""" |
426 | | - regex_pattern = re.escape(file_pattern).replace(r'\{version\}', r'(.+)') |
| 428 | + regex_pattern = re.escape(file_pattern).replace(r'\{version\}', r'(.+?)').replace(r'\{fragment\}', r'.*?') |
427 | 429 | match = re.match(regex_pattern, filename) |
428 | 430 | if not match or not match.group(1): |
429 | 431 | self.logger.debug(f"CDC Snapshot: No version string match found for filename: {filename}") |
@@ -481,18 +483,35 @@ def _read_snapshot_dataframe(self, version_info: VersionInfo, dataflow_config: D |
481 | 483 |
|
482 | 484 | if self.sourceType == CDCSnapshotSourceTypes.FILE: |
483 | 485 | file_path = self.source.path.replace("{version}", version_info.formatted_value) |
484 | | - self.logger.debug(f"CDC Snapshot: Reading file: {file_path}") |
485 | | - |
486 | | - schema_path = self.source.schemaPath |
487 | | - select_exp = self.source.selectExp |
488 | | - |
489 | | - df = SourceBatchFiles( |
490 | | - path=file_path, |
491 | | - format=self.source.format, |
492 | | - readerOptions=self.source.readerOptions, |
493 | | - schemaPath=schema_path, |
494 | | - selectExp=select_exp |
495 | | - ).read_source(read_config) |
| 486 | + |
| 487 | + if '{fragment}' in file_path: |
| 488 | + search_pattern = file_path.replace('{fragment}', "*") |
| 489 | + directory = os.path.dirname(search_pattern) |
| 490 | + filename_pattern = os.path.basename(search_pattern) |
| 491 | + dbutils = pipeline_config.get_dbutils() |
| 492 | + files = [f.path for f in dbutils.fs.ls(directory) if fnmatch.fnmatch(f.name, filename_pattern)] |
| 493 | + else: |
| 494 | + files = [file_path] |
| 495 | + |
| 496 | + df = None |
| 497 | + for file in files: |
| 498 | + self.logger.debug(f"CDC Snapshot: Reading file: {file_path}") |
| 499 | + |
| 500 | + schema_path = self.source.schemaPath |
| 501 | + select_exp = self.source.selectExp |
| 502 | + |
| 503 | + file_df = SourceBatchFiles( |
| 504 | + path=file, |
| 505 | + format=self.source.format, |
| 506 | + readerOptions=self.source.readerOptions, |
| 507 | + schemaPath=schema_path, |
| 508 | + selectExp=select_exp |
| 509 | + ).read_source(read_config) |
| 510 | + |
| 511 | + if df: |
| 512 | + df = df.union(file_df) |
| 513 | + else: |
| 514 | + df = file_df |
496 | 515 |
|
497 | 516 | # Apply filter if specified |
498 | 517 | if self.source.filter: |
|
0 commit comments