diff --git a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
index d9b67e34c..6c884f3e1 100644
--- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
+++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
@@ -7,12 +7,13 @@
 from typing import Any, Dict, List, Literal, Optional, Union
 
 import dpath
-from pydantic.v1 import AnyUrl, BaseModel, Field
+from pydantic.v1 import AnyUrl, BaseModel, Field, validator
 
 from airbyte_cdk import OneOfOptionConfig
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
 from airbyte_cdk.sources.utils import schema_helpers
+from airbyte_cdk.utils.datetime_helpers import ab_datetime_try_parse
 
 
 class DeliverRecords(BaseModel):
@@ -53,13 +54,39 @@ class AbstractFileBasedSpec(BaseModel):
     start_date: Optional[str] = Field(
         title="Start Date",
         description="UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.",
-        examples=["2021-01-01T00:00:00.000000Z"],
+        examples=[
+            "2021-01-01",  # NOTE(review): date-only, yet format is "date-time" - confirm consumers accept it
+            "2021-01-01T00:00:00Z",
+            "2021-01-01T00:00:00.000Z",
+            "2021-01-01T00:00:00.000000Z",
+        ],
         format="date-time",
-        pattern="^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$",
-        pattern_descriptor="YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
+        pattern=r"^[0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?(Z|[+-][0-9]{2}:[0-9]{2})?)?$",
+        pattern_descriptor="YYYY-MM-DD, YYYY-MM-DDTHH:mm:ssZ, or YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
         order=1,
     )
 
+    @validator("start_date", pre=True)
+    def validate_start_date(
+        cls,  # noqa: N805  # Pydantic validators use cls, not self
+        v: Optional[str],
+    ) -> Optional[str]:
+        """Reject a ``start_date`` that ``ab_datetime_try_parse`` cannot parse.
+
+        ``None`` is returned as-is. Any other value is handed to the helper only
+        as a check: the parsed result is discarded and the input is returned
+        unchanged. If the helper returns ``None``, a ``ValueError`` is raised.
+        """
+        if v is None:
+            return v
+        parsed = ab_datetime_try_parse(v)
+        if parsed is None:
+            raise ValueError(
+                f"'{v}' is not a valid datetime string. "
+                "Please use a format like '2021-01-01T00:00:00Z' or '2021-01-01T00:00:00.000000Z'."
+            )
+        return v
+
     streams: List[FileBasedStreamConfig] = Field(
         title="The list of streams to sync",
         description='Each instance of this configuration defines a stream. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.',
diff --git a/unit_tests/sources/file_based/config/test_abstract_file_based_spec.py b/unit_tests/sources/file_based/config/test_abstract_file_based_spec.py
index ec37567a8..57d98da76 100644
--- a/unit_tests/sources/file_based/config/test_abstract_file_based_spec.py
+++ b/unit_tests/sources/file_based/config/test_abstract_file_based_spec.py
@@ -8,6 +8,7 @@
 from jsonschema import ValidationError, validate
 from pydantic.v1 import BaseModel
 
+from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
     AvroFormat,
     CsvFormat,
@@ -40,3 +41,27 @@ def test_parquet_file_type_is_not_a_valid_csv_file_type(
             validate(instance=format_config[file_type], schema=file_format.schema())
     else:
         validate(instance=format_config[file_type], schema=file_format.schema())
+
+
+@pytest.mark.parametrize(
+    "start_date, should_pass",
+    [
+        pytest.param("2021-01-01T00:00:00.000000Z", True, id="with_microseconds"),
+        pytest.param("2021-01-01T00:00:00Z", True, id="without_microseconds"),
+        pytest.param("2021-01-01T00:00:00.000Z", True, id="with_milliseconds"),
+        pytest.param("2025-01-01T00:00:00Z", True, id="terraform_provider_format"),
+        pytest.param("2021-01-01T00:00:00+00:00", True, id="with_timezone_offset"),
+        pytest.param("2021-01-01", True, id="date_only"),
+        pytest.param(None, True, id="none_value"),
+        pytest.param("not-a-date", False, id="invalid_string"),
+        pytest.param("", False, id="empty_string"),
+    ],
+)
+def test_start_date_validation(start_date: str, should_pass: bool) -> None:
+    """Accepted values come back unchanged; rejected values raise ValueError."""
+    if should_pass:
+        result = AbstractFileBasedSpec.validate_start_date(start_date)
+        assert result == start_date
+    else:
+        with pytest.raises(ValueError, match="is not a valid datetime string"):
+            AbstractFileBasedSpec.validate_start_date(start_date)
diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
index f31585412..f16d83e20 100644
--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -61,10 +61,15 @@
                     "start_date": {
                         "title": "Start Date",
                         "description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.",
-                        "examples": ["2021-01-01T00:00:00.000000Z"],
+                        "examples": [
+                            "2021-01-01",
+                            "2021-01-01T00:00:00Z",
+                            "2021-01-01T00:00:00.000Z",
+                            "2021-01-01T00:00:00.000000Z",
+                        ],
                         "format": "date-time",
-                        "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$",
-                        "pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
+                        "pattern": r"^[0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?(Z|[+-][0-9]{2}:[0-9]{2})?)?$",
+                        "pattern_descriptor": "YYYY-MM-DD, YYYY-MM-DDTHH:mm:ssZ, or YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
                         "order": 1,
                         "type": "string",
                     },