Skip to content

Commit c7dee62

Browse files
fix: use flexible datetime parsing in filter_files_by_globs_and_start_date
Replace strict `datetime.strptime` with `ab_datetime_parse` in `filter_files_by_globs_and_start_date` to accept valid ISO 8601 dates without microseconds (e.g. `2025-01-01T00:00:00Z`). Also handle timezone-naive `file.last_modified` values by assuming UTC for the comparison, preventing a `TypeError` on mixed naive/aware comparisons.

Closes #920

Co-Authored-By: AJ Steers <aj@airbyte.io>
1 parent 7f41401 commit c7dee62

2 files changed

Lines changed: 37 additions & 3 deletions

File tree

airbyte_cdk/sources/file_based/file_based_stream_reader.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import logging
66
import time
77
from abc import ABC, abstractmethod
8-
from datetime import datetime
8+
from datetime import datetime, timezone
99
from enum import Enum
1010
from io import IOBase
1111
from os import makedirs, path
@@ -24,6 +24,7 @@
2424
from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
2525
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
2626
from airbyte_cdk.sources.file_based.remote_file import RemoteFile, UploadableRemoteFile
27+
from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse
2728

2829

2930
class FileReadMode(Enum):
@@ -105,15 +106,20 @@ def filter_files_by_globs_and_start_date(
105106
Utility method for filtering files based on globs.
106107
"""
107108
start_date = (
108-
datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
109+
ab_datetime_parse(self.config.start_date)
109110
if self.config and self.config.start_date
110111
else None
111112
)
112113
seen = set()
113114

114115
for file in files:
115116
if self.file_matches_globs(file, globs):
116-
if file.uri not in seen and (not start_date or file.last_modified >= start_date):
117+
last_modified = (
118+
file.last_modified
119+
if file.last_modified.tzinfo is not None
120+
else file.last_modified.replace(tzinfo=timezone.utc)
121+
)
122+
if file.uri not in seen and (not start_date or last_modified >= start_date):
117123
seen.add(file.uri)
118124
yield file
119125

unit_tests/sources/file_based/test_file_based_stream_reader.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -408,6 +408,34 @@ def documentation_url(cls) -> AnyUrl:
408408
set(),
409409
id="all_csvs_modified_exactly_on_start_date",
410410
),
411+
pytest.param(
412+
["**/*.csv"],
413+
{"start_date": "2023-06-01T00:00:00Z", "streams": []},
414+
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
415+
set(),
416+
id="start_date_without_microseconds",
417+
),
418+
pytest.param(
419+
["**/*.csv"],
420+
{"start_date": "2023-06-10T00:00:00Z", "streams": []},
421+
set(),
422+
set(),
423+
id="start_date_without_microseconds_modified_before",
424+
),
425+
pytest.param(
426+
["**/*.csv"],
427+
{"start_date": "2023-06-01T00:00:00+00:00", "streams": []},
428+
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
429+
set(),
430+
id="start_date_with_utc_offset",
431+
),
432+
pytest.param(
433+
["**/*.csv"],
434+
{"start_date": "2023-06-01", "streams": []},
435+
{"a.csv", "a/b.csv", "a/c.csv", "a/b/c.csv", "a/c/c.csv", "a/b/c/d.csv"},
436+
set(),
437+
id="start_date_date_only",
438+
),
411439
],
412440
)
413441
def test_globs_and_prefixes_from_globs(

0 commit comments

Comments
 (0)