From 972cc60fe8c6e8a02a73525c8a372bc78aeb11e2 Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 17:32:18 -0400 Subject: [PATCH 01/14] minor improvements to the development flow int the cdk. automatically format python files after generating pydnatic models, upgrade dagger version, install dagger normally as a dev poetry dependency, allow specifying a custom manifest file path in IDE debug configuration. --- bin/generate-component-manifest-dagger.sh | 1 - debug_manifest/README.md | 6 +- debug_manifest/debug_manifest.py | 4 +- poetry.lock | 219 +++++++++++++++++++--- pyproject.toml | 3 +- 5 files changed, 204 insertions(+), 29 deletions(-) diff --git a/bin/generate-component-manifest-dagger.sh b/bin/generate-component-manifest-dagger.sh index f920ff727..908f92a6c 100755 --- a/bin/generate-component-manifest-dagger.sh +++ b/bin/generate-component-manifest-dagger.sh @@ -7,5 +7,4 @@ set -e -pip install dagger-io==0.13.3 python bin/generate_component_manifest_files.py diff --git a/debug_manifest/README.md b/debug_manifest/README.md index b5b2eeb2f..44c9ab225 100644 --- a/debug_manifest/README.md +++ b/debug_manifest/README.md @@ -22,11 +22,15 @@ To configure the debugger in VSCode to run the `debug_manifest`, follow these st "request": "launch", "console": "integratedTerminal", "cwd": "${workspaceFolder}/debug_manifest", - "python": "/bin/python", + "python": "/bin/python", // REPLACE ME "module": "debug_manifest", "args": [ // SPECIFY THE COMMAND: [spec, check, discover, read] "read", + // SPECIFY THE MANIFEST FILE + "--manifest-path", + // PATH TO THE MANIFEST FILE + "resources/manifest.yaml", // SPECIFY THE CONFIG "--config", // PATH TO THE CONFIG FILE diff --git a/debug_manifest/debug_manifest.py b/debug_manifest/debug_manifest.py index 2b544fc70..6c25dde11 100644 --- a/debug_manifest/debug_manifest.py +++ b/debug_manifest/debug_manifest.py @@ -24,13 +24,15 @@ def debug_manifest(source: YamlDeclarativeSource, 
args: list[str]) -> None: if __name__ == "__main__": args = sys.argv[1:] + parsed_args = AirbyteEntrypoint.parse_args(args) + manifest_path = AirbyteEntrypoint.parse_args(args).manifest_path or "resources/manifest.yaml" catalog_path = AirbyteEntrypoint.extract_catalog(args) config_path = AirbyteEntrypoint.extract_config(args) state_path = AirbyteEntrypoint.extract_state(args) debug_manifest( YamlDeclarativeSource( - path_to_yaml="resources/manifest.yaml", + path_to_yaml=manifest_path, catalog=YamlDeclarativeSource.read_catalog(catalog_path) if catalog_path else None, config=YamlDeclarativeSource.read_config(config_path) if config_path else None, state=YamlDeclarativeSource.read_state(state_path) if state_path else None, diff --git a/poetry.lock b/poetry.lock index 862c26076..91849a96d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -233,7 +233,7 @@ version = "24.3.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"}, {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"}, @@ -270,12 +270,31 @@ version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] +[[package]] +name = "beartype" +version = "0.21.0" +description = "Unbearably fast near-real-time hybrid runtime-static type-checking in pure Python." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "beartype-0.21.0-py3-none-any.whl", hash = "sha256:b6a1bd56c72f31b0a496a36cc55df6e2f475db166ad07fa4acc7e74f4c7f34c0"}, + {file = "beartype-0.21.0.tar.gz", hash = "sha256:f9a5078f5ce87261c2d22851d19b050b64f6a805439e8793aecf01ce660d3244"}, +] + +[package.extras] +dev = ["autoapi (>=0.9.0)", "click", "coverage (>=5.5)", "equinox ; sys_platform == \"linux\"", "jax[cpu] ; sys_platform == \"linux\"", "jaxtyping ; sys_platform == \"linux\"", "langchain", "mypy (>=0.800) ; platform_python_implementation != \"PyPy\"", "nuitka (>=1.2.6) ; sys_platform == \"linux\"", "numba ; python_version < \"3.13.0\"", "numpy ; sys_platform != \"darwin\" and platform_python_implementation != \"PyPy\"", "pandera", "pydata-sphinx-theme (<=0.7.2)", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "rich-click", "sphinx", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)", "sqlalchemy", "tox (>=3.20.1)", "typing-extensions (>=3.10.0.0)", "xarray"] +doc-rtd = ["autoapi (>=0.9.0)", "pydata-sphinx-theme (<=0.7.2)", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)"] +test = ["click", "coverage (>=5.5)", "equinox ; sys_platform == \"linux\"", "jax[cpu] ; sys_platform == \"linux\"", "jaxtyping ; sys_platform == \"linux\"", "langchain", "mypy (>=0.800) ; platform_python_implementation != \"PyPy\"", "nuitka (>=1.2.6) ; sys_platform == \"linux\"", "numba ; python_version < \"3.13.0\"", "numpy ; sys_platform != \"darwin\" and platform_python_implementation != \"PyPy\"", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "rich-click", "sphinx", "sqlalchemy", "tox (>=3.20.1)", "typing-extensions (>=3.10.0.0)", "xarray"] +test-tox = ["click", "equinox ; sys_platform == \"linux\"", "jax[cpu] ; sys_platform == \"linux\"", "jaxtyping ; sys_platform == \"linux\"", "langchain", "mypy (>=0.800) ; platform_python_implementation != \"PyPy\"", "nuitka (>=1.2.6) ; sys_platform == \"linux\"", 
"numba ; python_version < \"3.13.0\"", "numpy ; sys_platform != \"darwin\" and platform_python_implementation != \"PyPy\"", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "rich-click", "sphinx", "sqlalchemy", "typing-extensions (>=3.10.0.0)", "xarray"] +test-tox-coverage = ["coverage (>=5.5)"] + [[package]] name = "beautifulsoup4" version = "4.12.3" @@ -354,7 +373,7 @@ version = "24.1.2" description = "Composable complex class support for attrs and dataclasses." optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "cattrs-24.1.2-py3-none-any.whl", hash = "sha256:67c7495b760168d931a10233f979b28dc04daf853b30752246f4f8471c6d68d0"}, {file = "cattrs-24.1.2.tar.gz", hash = "sha256:8028cfe1ff5382df59dd36474a86e02d817b06eaf8af84555441bac915d2ef85"}, @@ -967,6 +986,29 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "dagger-io" +version = "0.18.6" +description = "A client package for running Dagger pipelines in Python." 
+optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "dagger_io-0.18.6-py3-none-any.whl", hash = "sha256:357e426ce42ec5b27da87a9e3145e6b5ba5fd02dbbad317ecc1ff09c1e1a44cd"}, + {file = "dagger_io-0.18.6.tar.gz", hash = "sha256:165c1c16165feb12cfdd89fce50c898e6ebc5a9acaaa0b725569317c404e45ce"}, +] + +[package.dependencies] +anyio = ">=3.6.2" +beartype = ">=0.18.2" +cattrs = ">=24.1.0" +gql = {version = ">=3.5.0", extras = ["httpx"]} +opentelemetry-exporter-otlp-proto-http = ">=1.23.0" +opentelemetry-sdk = ">=1.23.0" +platformdirs = ">=2.6.2" +rich = ">=10.11.0" +typing-extensions = ">=4.13.0" + [[package]] name = "dataclasses-json" version = "0.6.7" @@ -1680,7 +1722,7 @@ version = "1.70.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, @@ -1693,6 +1735,49 @@ protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4 [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0)"] +[[package]] +name = "gql" +version = "4.0.0" +description = "GraphQL client for Python" +optional = false +python-versions = ">=3.8.1" +groups = ["dev"] +files = [ + {file = "gql-4.0.0-py3-none-any.whl", hash = "sha256:f3beed7c531218eb24d97cb7df031b4a84fdb462f4a2beb86e2633d395937479"}, + {file = "gql-4.0.0.tar.gz", hash = "sha256:f22980844eb6a7c0266ffc70f111b9c7e7c7c13da38c3b439afc7eab3d7c9c8e"}, +] + +[package.dependencies] +anyio = ">=3.0,<5" +backoff = ">=1.11.1,<3.0" +graphql-core = ">=3.2,<3.3" +httpx = {version = ">=0.27.0,<1", optional = true, markers = "extra == \"httpx\""} +yarl = ">=1.6,<2.0" + +[package.extras] +aiofiles = ["aiofiles"] +aiohttp = ["aiohttp 
(>=3.11.2,<4)"] +all = ["aiofiles", "aiohttp (>=3.11.2,<4)", "botocore (>=1.21,<2)", "httpx (>=0.27.0,<1)", "requests (>=2.26,<3)", "requests_toolbelt (>=1.0.0,<2)", "websockets (>=14.2,<16)"] +botocore = ["botocore (>=1.21,<2)"] +dev = ["aiofiles", "aiofiles", "aiohttp (>=3.11.2,<4)", "black (==25.1.0)", "botocore (>=1.21,<2)", "check-manifest (>=0.42,<1)", "flake8 (==7.1.2)", "httpx (>=0.27.0,<1)", "isort (==6.0.1)", "mypy (==1.15)", "parse (==1.20.2)", "pytest (==8.3.4)", "pytest-asyncio (==0.25.3)", "pytest-console-scripts (==1.4.1)", "pytest-cov (==6.0.0)", "requests (>=2.26,<3)", "requests_toolbelt (>=1.0.0,<2)", "sphinx (>=7.0.0,<8) ; python_version <= \"3.9\"", "sphinx (>=8.1.0,<9) ; python_version > \"3.9\"", "sphinx-argparse (==0.5.2)", "sphinx_rtd_theme (>=3.0.2,<4)", "types-aiofiles", "types-requests", "vcrpy (==7.0.0)", "websockets (>=14.2,<16)"] +httpx = ["httpx (>=0.27.0,<1)"] +requests = ["requests (>=2.26,<3)", "requests_toolbelt (>=1.0.0,<2)"] +test = ["aiofiles", "aiofiles", "aiohttp (>=3.11.2,<4)", "botocore (>=1.21,<2)", "httpx (>=0.27.0,<1)", "parse (==1.20.2)", "pytest (==8.3.4)", "pytest-asyncio (==0.25.3)", "pytest-console-scripts (==1.4.1)", "pytest-cov (==6.0.0)", "requests (>=2.26,<3)", "requests_toolbelt (>=1.0.0,<2)", "vcrpy (==7.0.0)", "websockets (>=14.2,<16)"] +test-no-transport = ["aiofiles", "parse (==1.20.2)", "pytest (==8.3.4)", "pytest-asyncio (==0.25.3)", "pytest-console-scripts (==1.4.1)", "pytest-cov (==6.0.0)", "vcrpy (==7.0.0)"] +websockets = ["websockets (>=14.2,<16)"] + +[[package]] +name = "graphql-core" +version = "3.2.6" +description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." 
+optional = false +python-versions = "<4,>=3.6" +groups = ["dev"] +files = [ + {file = "graphql_core-3.2.6-py3-none-any.whl", hash = "sha256:78b016718c161a6fb20a7d97bbf107f331cd1afe53e45566c59f776ed7f0b45f"}, + {file = "graphql_core-3.2.6.tar.gz", hash = "sha256:c08eec22f9e40f0bd61d805907e3b3b1b9a320bc606e23dc145eebca07c8fbab"}, +] + [[package]] name = "greenlet" version = "3.1.1" @@ -2032,14 +2117,14 @@ all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2 name = "importlib-metadata" version = "8.7.0" description = "Read metadata from Python packages" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"manifest-server\"" +groups = ["main", "dev"] files = [ {file = "importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd"}, {file = "importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000"}, ] +markers = {main = "extra == \"manifest-server\""} [package.dependencies] zipp = ">=3.20" @@ -2929,10 +3014,9 @@ test = ["Cython", "greenlet ; python_version < \"3.14\"", "ipython", "packaging" name = "multidict" version = "6.1.0" description = "multidict implementation" -optional = true +optional = false python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"vector-db-based\"" +groups = ["main", "dev"] files = [ {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, @@ -3027,6 +3111,7 @@ files = [ {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, {file = "multidict-6.1.0.tar.gz", hash = 
"sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, ] +markers = {main = "extra == \"vector-db-based\""} [package.dependencies] typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} @@ -3229,19 +3314,103 @@ et-xmlfile = "*" name = "opentelemetry-api" version = "1.36.0" description = "OpenTelemetry Python API" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"manifest-server\"" +groups = ["main", "dev"] files = [ {file = "opentelemetry_api-1.36.0-py3-none-any.whl", hash = "sha256:02f20bcacf666e1333b6b1f04e647dc1d5111f86b8e510238fcc56d7762cda8c"}, {file = "opentelemetry_api-1.36.0.tar.gz", hash = "sha256:9a72572b9c416d004d492cbc6e61962c0501eaf945ece9b5a0f56597d8348aa0"}, ] +markers = {main = "extra == \"manifest-server\""} [package.dependencies] importlib-metadata = ">=6.0,<8.8.0" typing-extensions = ">=4.5.0" +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.36.0" +description = "OpenTelemetry Protobuf encoding" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "opentelemetry_exporter_otlp_proto_common-1.36.0-py3-none-any.whl", hash = "sha256:0fc002a6ed63eac235ada9aa7056e5492e9a71728214a61745f6ad04b923f840"}, + {file = "opentelemetry_exporter_otlp_proto_common-1.36.0.tar.gz", hash = "sha256:6c496ccbcbe26b04653cecadd92f73659b814c6e3579af157d8716e5f9f25cbf"}, +] + +[package.dependencies] +opentelemetry-proto = "1.36.0" + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.36.0" +description = "OpenTelemetry Collector Protobuf over HTTP Exporter" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "opentelemetry_exporter_otlp_proto_http-1.36.0-py3-none-any.whl", hash = "sha256:3d769f68e2267e7abe4527f70deb6f598f40be3ea34c6adc35789bea94a32902"}, + {file = "opentelemetry_exporter_otlp_proto_http-1.36.0.tar.gz", hash = 
"sha256:dd3637f72f774b9fc9608ab1ac479f8b44d09b6fb5b2f3df68a24ad1da7d356e"}, +] + +[package.dependencies] +googleapis-common-protos = ">=1.52,<2.0" +opentelemetry-api = ">=1.15,<2.0" +opentelemetry-exporter-otlp-proto-common = "1.36.0" +opentelemetry-proto = "1.36.0" +opentelemetry-sdk = ">=1.36.0,<1.37.0" +requests = ">=2.7,<3.0" +typing-extensions = ">=4.5.0" + +[[package]] +name = "opentelemetry-proto" +version = "1.36.0" +description = "OpenTelemetry Python Proto" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "opentelemetry_proto-1.36.0-py3-none-any.whl", hash = "sha256:151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e"}, + {file = "opentelemetry_proto-1.36.0.tar.gz", hash = "sha256:0f10b3c72f74c91e0764a5ec88fd8f1c368ea5d9c64639fb455e2854ef87dd2f"}, +] + +[package.dependencies] +protobuf = ">=5.0,<7.0" + +[[package]] +name = "opentelemetry-sdk" +version = "1.36.0" +description = "OpenTelemetry Python SDK" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "opentelemetry_sdk-1.36.0-py3-none-any.whl", hash = "sha256:19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb"}, + {file = "opentelemetry_sdk-1.36.0.tar.gz", hash = "sha256:19c8c81599f51b71670661ff7495c905d8fdf6976e41622d5245b791b06fa581"}, +] + +[package.dependencies] +opentelemetry-api = "1.36.0" +opentelemetry-semantic-conventions = "0.57b0" +typing-extensions = ">=4.5.0" + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.57b0" +description = "OpenTelemetry Semantic Conventions" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl", hash = "sha256:757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78"}, + {file = "opentelemetry_semantic_conventions-0.57b0.tar.gz", hash = "sha256:609a4a79c7891b4620d64c7aac6898f872d790d75f22019913a660756f27ff32"}, +] + +[package.dependencies] 
+opentelemetry-api = "1.36.0" +typing-extensions = ">=4.5.0" + [[package]] name = "orjson" version = "3.10.15" @@ -3677,10 +3846,9 @@ poetry-plugin = ["poetry (>=1.0,<2.0)"] name = "propcache" version = "0.2.1" description = "Accelerated property cache" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"vector-db-based\"" +groups = ["main", "dev"] files = [ {file = "propcache-0.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6b3f39a85d671436ee3d12c017f8fdea38509e4f25b28eb25877293c98c243f6"}, {file = "propcache-0.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d51fbe4285d5db5d92a929e3e21536ea3dd43732c5b177c7ef03f918dff9f2"}, @@ -3765,6 +3933,7 @@ files = [ {file = "propcache-0.2.1-py3-none-any.whl", hash = "sha256:52277518d6aae65536e9cea52d4e7fd2f7a66f4aa2d30ed3f2fcea620ace3c54"}, {file = "propcache-0.2.1.tar.gz", hash = "sha256:3f77ce728b19cb537714499928fe800c3dda29e8d9428778fc7c186da4c09a64"}, ] +markers = {main = "extra == \"vector-db-based\""} [[package]] name = "proto-plus" @@ -3790,7 +3959,7 @@ version = "5.29.4" description = "" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"}, {file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"}, @@ -6311,10 +6480,9 @@ files = [ name = "yarl" version = "1.18.3" description = "Yet another URL library" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"vector-db-based\"" +groups = ["main", "dev"] files = [ {file = "yarl-1.18.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34"}, {file = "yarl-1.18.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7"}, @@ -6399,6 +6567,7 @@ files = [ {file = "yarl-1.18.3-py3-none-any.whl", hash = "sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b"}, {file = "yarl-1.18.3.tar.gz", hash = "sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1"}, ] +markers = {main = "extra == \"vector-db-based\""} [package.dependencies] idna = ">=2.0" @@ -6409,14 +6578,14 @@ propcache = ">=0.2.0" name = "zipp" version = "3.23.0" description = "Backport of pathlib-compatible object wrapper for zip files" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"manifest-server\"" +groups = ["main", "dev"] files = [ {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"}, {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"}, ] +markers = {main = "extra == \"manifest-server\""} [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] @@ -6436,4 +6605,4 @@ vector-db-based = ["cohere", "langchain", "openai", "tiktoken"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "d8fd5c55f5250ab2f93c08c0831f2034a4dc5a13ace62f78c322acf575cb7ba7" +content-hash = "b03f6ecb7f72dc2b1669bfc1aa0b907ea70cc8c656230db0859de5a2da475783" diff --git a/pyproject.toml b/pyproject.toml index 5c2c06850..62bdee0a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,6 +120,7 @@ types-python-dateutil = "^2.9.0.20241003" types-pyyaml = "^6.0.12.20240917" types-cachetools = "^5.5.0.20240820" deptry = "^0.23.0" +dagger-io = "0.18.6" [tool.poetry.extras] file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"] @@ -147,7 +148,7 @@ pre-commit = {cmd = 
"poetry run pre-commit run --all-files", help = "Run all pre # Build tasks assemble = {cmd = "bin/generate-component-manifest-dagger.sh", help = "Generate component manifest files."} build-package = {cmd = "poetry build", help = "Build the python package: source and wheels archives."} -build = {sequence = ["assemble", "openapi-generate", "build-package"], help = "Run all tasks to build the package."} +build = {sequence = ["assemble", "openapi-generate", "build-package", "ruff-fix"], help = "Run all tasks to build the package."} # Format check tasks format-check = {sequence = ["_format-check-ruff", "_format-check-prettier"], help = "Check formatting for all file types via Ruff and Prettier.", ignore_fail = "return_non_zero"} From 598fe31c8473ad93bcb8bf80a2c61b8ab1f37b8f Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:57:53 -0700 Subject: [PATCH 02/14] Update debug_manifest/debug_manifest.py --- debug_manifest/debug_manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug_manifest/debug_manifest.py b/debug_manifest/debug_manifest.py index 6c25dde11..8cebe65ff 100644 --- a/debug_manifest/debug_manifest.py +++ b/debug_manifest/debug_manifest.py @@ -25,7 +25,7 @@ def debug_manifest(source: YamlDeclarativeSource, args: list[str]) -> None: if __name__ == "__main__": args = sys.argv[1:] parsed_args = AirbyteEntrypoint.parse_args(args) - manifest_path = AirbyteEntrypoint.parse_args(args).manifest_path or "resources/manifest.yaml" + manifest_path = parsed_args.manifest_path or "resources/manifest.yaml" catalog_path = AirbyteEntrypoint.extract_catalog(args) config_path = AirbyteEntrypoint.extract_config(args) state_path = AirbyteEntrypoint.extract_state(args) From 42d6476f369c30fa7406ecf1853f4cf1f203769b Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 18:15:27 -0400 Subject: [PATCH 03/14] handle case where manifest_file is not 
an argument --- debug_manifest/debug_manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug_manifest/debug_manifest.py b/debug_manifest/debug_manifest.py index 8cebe65ff..b87b00ecd 100644 --- a/debug_manifest/debug_manifest.py +++ b/debug_manifest/debug_manifest.py @@ -25,7 +25,7 @@ def debug_manifest(source: YamlDeclarativeSource, args: list[str]) -> None: if __name__ == "__main__": args = sys.argv[1:] parsed_args = AirbyteEntrypoint.parse_args(args) - manifest_path = parsed_args.manifest_path or "resources/manifest.yaml" + manifest_path = getattr(parsed_args, "manifest_path", None) or "resources/manifest.yaml" catalog_path = AirbyteEntrypoint.extract_catalog(args) config_path = AirbyteEntrypoint.extract_config(args) state_path = AirbyteEntrypoint.extract_state(args) From a22b03f2b4be939d83f4ab57b0c3b3ba15de78a6 Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 18:17:34 -0400 Subject: [PATCH 04/14] remove unnecessary code --- debug_manifest/debug_manifest.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/debug_manifest/debug_manifest.py b/debug_manifest/debug_manifest.py index b87b00ecd..45265dee9 100644 --- a/debug_manifest/debug_manifest.py +++ b/debug_manifest/debug_manifest.py @@ -3,17 +3,12 @@ # import sys -from typing import Any, Mapping from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch from airbyte_cdk.sources.declarative.yaml_declarative_source import ( YamlDeclarativeSource, ) -configuration: Mapping[str, Any] = { - "path_to_yaml": "resources/manifest.yaml", -} - def debug_manifest(source: YamlDeclarativeSource, args: list[str]) -> None: """ From be8d864a98156aca707b4c1081e668ed6d95a3ca Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 22:51:49 -0400 Subject: [PATCH 05/14] add additional contexts to download requester --- airbyte_cdk/sources/declarative/requesters/README.md | 3 ++- 
.../sources/declarative/requesters/http_job_repository.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/requesters/README.md b/airbyte_cdk/sources/declarative/requesters/README.md index cfeaf7e76..f91937103 100644 --- a/airbyte_cdk/sources/declarative/requesters/README.md +++ b/airbyte_cdk/sources/declarative/requesters/README.md @@ -1,6 +1,7 @@ # AsyncHttpJobRepository sequence diagram - Components marked as optional are not required and can be ignored. +- # TODO update - if `download_target_requester` is not provided, `download_target_extractor` will get urls from the `polling_response` - interpolation_context, e.g. `creation_response` or `polling_response` can be obtained from stream_slice @@ -37,7 +38,7 @@ sequenceDiagram UrlRequester -->> AsyncHttpJobRepository: Download URLs AsyncHttpJobRepository ->> DownloadRetriever: Download reports - DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`) + DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`, `creation_response`, `polling_response`) Reporting Server -->> DownloadRetriever: Report data DownloadRetriever -->> AsyncHttpJobRepository: Report data else Status: Failed diff --git a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py index 2ca38494e..2eb4e794c 100644 --- a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py +++ b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py @@ -221,6 +221,8 @@ def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]: extra_fields={ **job_slice.extra_fields, "download_target": target_url, + "creation_response": self._get_creation_response_interpolation_context(job), + "polling_response": self._get_polling_response_interpolation_context(job), }, ) for message in self.download_retriever.read_records({}, stream_slice): From 
e33fd59e8f4d983e100dfe7c5b12b72aef8512cb Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 22:52:11 -0400 Subject: [PATCH 06/14] make download_target_extractor optional in schema --- .../sources/declarative/declarative_component_schema.yaml | 3 +-- .../declarative/models/declarative_component_schema.py | 8 +++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 888989b13..cb7e7abbc 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -3797,7 +3797,6 @@ definitions: - polling_requester - download_requester - status_extractor - - download_target_extractor properties: type: type: string @@ -3815,7 +3814,7 @@ definitions: - "$ref": "#/definitions/DpathExtractor" - "$ref": "#/definitions/CustomRecordExtractor" download_target_extractor: - description: Responsible for fetching the final result `urls` provided by the completed / finished / ready async job. + description: Responsible for fetching the information needed to download the completed job from the polling HTTP response. anyOf: - "$ref": "#/definitions/DpathExtractor" - "$ref": "#/definitions/CustomRecordExtractor" diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index ccd2e9e8d..880fbda67 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1,5 +1,3 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 
- # generated by datamodel-codegen: # filename: declarative_component_schema.yaml @@ -2852,9 +2850,9 @@ class AsyncRetriever(BaseModel): status_extractor: Union[DpathExtractor, CustomRecordExtractor] = Field( ..., description="Responsible for fetching the actual status of the async job." ) - download_target_extractor: Union[DpathExtractor, CustomRecordExtractor] = Field( - ..., - description="Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.", + download_target_extractor: Optional[Union[DpathExtractor, CustomRecordExtractor]] = Field( + None, + description="Responsible for fetching the information needed to download the completed job from the polling HTTP response.", ) download_extractor: Optional[ Union[DpathExtractor, CustomRecordExtractor, ResponseToFileExtractor] From fab80332290625388fa9ae7ba50e8fa223d0c752 Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 23:22:06 -0400 Subject: [PATCH 07/14] make download_target_extractor optional --- .../parsers/model_to_component_factory.py | 14 ++++++---- .../requesters/http_job_repository.py | 26 ++++++++++++++----- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 4a73dced3..98490a4e6 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3603,11 +3603,15 @@ def _get_job_timeout() -> datetime.timedelta: status_extractor = self._create_component_from_model( model=model.status_extractor, decoder=decoder, config=config, name=name ) - download_target_extractor = self._create_component_from_model( - model=model.download_target_extractor, - decoder=decoder, - config=config, - name=name, + download_target_extractor = ( + 
self._create_component_from_model( + model=model.download_target_extractor, + decoder=decoder, + config=config, + name=name, + ) + if model.download_target_extractor + else None ) job_repository: AsyncJobRepository = AsyncHttpJobRepository( diff --git a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py index 2eb4e794c..93e9125f4 100644 --- a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py +++ b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py @@ -43,7 +43,7 @@ class AsyncHttpJobRepository(AsyncJobRepository): delete_requester: Optional[Requester] status_extractor: DpathExtractor status_mapping: Mapping[str, AsyncJobStatus] - download_target_extractor: DpathExtractor + download_target_extractor: Optional[DpathExtractor] # timeout for the job to be completed, passed from `polling_job_timeout` job_timeout: Optional[timedelta] = None @@ -213,14 +213,14 @@ def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]: """ - for target_url in self._get_download_targets(job): + for download_target in self._get_download_targets(job): job_slice = job.job_parameters() stream_slice = StreamSlice( partition=job_slice.partition, cursor_slice=job_slice.cursor_slice, extra_fields={ **job_slice.extra_fields, - "download_target": target_url, + "download_target": download_target, "creation_response": self._get_creation_response_interpolation_context(job), "polling_response": self._get_polling_response_interpolation_context(job), }, @@ -332,9 +332,18 @@ def _get_create_job_stream_slice(self, job: AsyncJob) -> StreamSlice: ) def _get_download_targets(self, job: AsyncJob) -> Iterable[str]: - if not self.download_target_requester: - url_response = self._polling_job_response_by_id[job.api_job_id()] - else: + """Returns an iterable of strings to help target requests for downloading async jobs.""" + # If neither download_target_extractor nor 
download_target_requester are provided,return a single empty string + # to express the need to make a single download request without any download_target value + if not self.download_target_extractor and not self.download_target_requester: + lazy_log( + LOGGER, + logging.DEBUG, + lambda: "No download_target_extractor or download_target_requester provided. Using fallback behavior for single download request without download_target.", + ) + return [""] + + if self.download_target_requester: stream_slice: StreamSlice = StreamSlice( partition={}, cursor_slice={}, @@ -348,5 +357,8 @@ def _get_download_targets(self, job: AsyncJob) -> Iterable[str]: internal_message="Always expect a response or an exception from download_target_requester", failure_type=FailureType.system_error, ) - + else: + # if no download_target_requester is provided, we extract directly from the polling response + url_response = self._polling_job_response_by_id[job.api_job_id()] + yield from self.download_target_extractor.extract_records(url_response) # type: ignore # we expect download_target_extractor to always return list of strings From 7a25306c4596dc92150bc33eca02a6b663ffd22f Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 20:35:30 -0700 Subject: [PATCH 08/14] Update airbyte_cdk/sources/declarative/requesters/http_job_repository.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../sources/declarative/requesters/http_job_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py index 93e9125f4..5b7e89395 100644 --- a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py +++ b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py @@ -333,7 +333,7 @@ def _get_create_job_stream_slice(self, job: AsyncJob) -> StreamSlice: def 
_get_download_targets(self, job: AsyncJob) -> Iterable[str]: """Returns an iterable of strings to help target requests for downloading async jobs.""" - # If neither download_target_extractor nor download_target_requester are provided,return a single empty string + # If neither download_target_extractor nor download_target_requester are provided, return a single empty string # to express the need to make a single download request without any download_target value if not self.download_target_extractor and not self.download_target_requester: lazy_log( From 20914b802aae0d61c0b2c800deef0bf8d3fc9200 Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Tue, 9 Sep 2025 23:38:43 -0400 Subject: [PATCH 09/14] linting --- .../sources/declarative/requesters/http_job_repository.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py index 93e9125f4..e4cbdec56 100644 --- a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py +++ b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py @@ -333,7 +333,7 @@ def _get_create_job_stream_slice(self, job: AsyncJob) -> StreamSlice: def _get_download_targets(self, job: AsyncJob) -> Iterable[str]: """Returns an iterable of strings to help target requests for downloading async jobs.""" - # If neither download_target_extractor nor download_target_requester are provided,return a single empty string + # If neither download_target_extractor nor download_target_requester are provided,return a single empty string # to express the need to make a single download request without any download_target value if not self.download_target_extractor and not self.download_target_requester: lazy_log( @@ -360,5 +360,5 @@ def _get_download_targets(self, job: AsyncJob) -> Iterable[str]: else: # if no download_target_requester is provided, we extract 
directly from the polling response url_response = self._polling_job_response_by_id[job.api_job_id()] - + yield from self.download_target_extractor.extract_records(url_response) # type: ignore # we expect download_target_extractor to always return list of strings From c67d4aefd40f5b7f94a58e8aa7a56f85efac0f9f Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Wed, 10 Sep 2025 21:08:04 -0400 Subject: [PATCH 10/14] clean up error handling for http_job_repository --- .../requesters/http_job_repository.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py index 12a16e3a5..d837ed902 100644 --- a/airbyte_cdk/sources/declarative/requesters/http_job_repository.py +++ b/airbyte_cdk/sources/declarative/requesters/http_job_repository.py @@ -333,17 +333,26 @@ def _get_create_job_stream_slice(self, job: AsyncJob) -> StreamSlice: def _get_download_targets(self, job: AsyncJob) -> Iterable[str]: """Returns an iterable of strings to help target requests for downloading async jobs.""" - # If neither download_target_extractor nor download_target_requester are provided, return a single empty string + # If neither download_target_extractor nor download_target_requester are provided, yield a single empty string # to express the need to make a single download request without any download_target value - if not self.download_target_extractor and not self.download_target_requester: - lazy_log( - LOGGER, - logging.DEBUG, - lambda: "No download_target_extractor or download_target_requester provided. 
Using fallback behavior for single download request without download_target.",
-            )
-            return [""]
+        if not self.download_target_extractor:
+            if not self.download_target_requester:
+                lazy_log(
+                    LOGGER,
+                    logging.DEBUG,
+                    lambda: "No download_target_extractor or download_target_requester provided. Will attempt a single download request without a `download_target`.",
+                )
+                yield ""
+                return
+            else:
+                raise AirbyteTracedException(
+                    internal_message="Must define a `download_target_extractor` when using a `download_target_requester`.",
+                    failure_type=FailureType.config_error,
+                )
 
+        # We have a download_target_extractor, use it to extract the download_target
         if self.download_target_requester:
+            # if a download_target_requester is defined, we extract from the response of a request specifically for download targets.
             stream_slice: StreamSlice = StreamSlice(
                 partition={},
                 cursor_slice={},
@@ -358,7 +367,7 @@ def _get_download_targets(self, job: AsyncJob) -> Iterable[str]:
                 failure_type=FailureType.system_error,
             )
         else:
-            # if no download_target_requester is provided, we extract directly from the polling response
+            # if no download_target_requester is defined, we extract from the polling response
             url_response = self._polling_job_response_by_id[job.api_job_id()]
 
             yield from self.download_target_extractor.extract_records(url_response)  # type: ignore  # we expect download_target_extractor to always return list of strings

From 31b29b9e98fa273557d2735231b27f9c6efa99b2 Mon Sep 17 00:00:00 2001
From: David Gold <32782137+dbgold17@users.noreply.github.com>
Date: Wed, 10 Sep 2025 21:28:22 -0400
Subject: [PATCH 11/14] enforce dependency between fields in model to component
 factory

---
 .../declarative/parsers/model_to_component_factory.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
index 98490a4e6..cf792a67e 100644
--- 
a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3446,6 +3446,11 @@ def create_async_retriever( transformations: List[RecordTransformation], **kwargs: Any, ) -> AsyncRetriever: + if model.download_target_requester and not model.download_target_extractor: + raise ValueError( + f"`download_target_extractor` required if using a `download_target_requester`" + ) + def _get_download_retriever( requester: Requester, extractor: RecordExtractor, _decoder: Decoder ) -> SimpleRetriever: From c3139aaa2b1d0126b14919a08191e6df2a9bd3d3 Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Wed, 10 Sep 2025 21:49:11 -0400 Subject: [PATCH 12/14] update README --- .../declarative_component_schema.yaml | 2 +- .../models/declarative_component_schema.py | 2 ++ .../sources/declarative/requesters/README.md | 18 ++++++++++++++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index cb7e7abbc..bb16149be 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1593,7 +1593,7 @@ definitions: - "$ref": "#/definitions/HttpRequester" - "$ref": "#/definitions/CustomRequester" download_target_extractor: - description: Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response + description: Responsible for fetching the final result `urls` provided by the completed / finished / ready async job. 
anyOf: - "$ref": "#/definitions/DpathExtractor" - "$ref": "#/definitions/CustomRecordExtractor" diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 880fbda67..2b71a97bd 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025 Airbyte, Inc., all rights reserved. + # generated by datamodel-codegen: # filename: declarative_component_schema.yaml diff --git a/airbyte_cdk/sources/declarative/requesters/README.md b/airbyte_cdk/sources/declarative/requesters/README.md index f91937103..096081dc7 100644 --- a/airbyte_cdk/sources/declarative/requesters/README.md +++ b/airbyte_cdk/sources/declarative/requesters/README.md @@ -1,9 +1,19 @@ +# Download Target and Download Requester + +- The `creation_response` and `polling_response` interpolation contexts are always available during the job download step of the process. + +- The`download_target` interpolation context is generated by the `download_target_extractor` and made available to the job download step as well. + - if `download_target_requester` is not provided, `download_target_extractor` will get urls from the `polling_response` + - if `download_target_requester` is provided, an additional request will be made to fetch job download targets and `download_target_extractor` will operate on that response + +## Some important considerations + +- **Note:** If the `download_target_extractor` and `download_target_requester` are not defined, a single job download request will be made without the `download_target` context. +- **Note:** The `download_target_extractor` is required (not optional) if using a `download_target_requester` + # AsyncHttpJobRepository sequence diagram - Components marked as optional are not required and can be ignored. 
-- # TODO update -- if `download_target_requester` is not provided, `download_target_extractor` will get urls from the `polling_response` -- interpolation_context, e.g. `creation_response` or `polling_response` can be obtained from stream_slice ```mermaid --- @@ -38,7 +48,7 @@ sequenceDiagram UrlRequester -->> AsyncHttpJobRepository: Download URLs AsyncHttpJobRepository ->> DownloadRetriever: Download reports - DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`, `creation_response`, `polling_response`) + DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `download_target`, `creation_response`, `polling_response`) Reporting Server -->> DownloadRetriever: Report data DownloadRetriever -->> AsyncHttpJobRepository: Report data else Status: Failed From b694b5b704fa4798b823ec31d84e88b55b8c6530 Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Wed, 10 Sep 2025 21:57:47 -0400 Subject: [PATCH 13/14] undo mistake in descriptions --- .../sources/declarative/declarative_component_schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index bb16149be..50e80b601 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1593,7 +1593,7 @@ definitions: - "$ref": "#/definitions/HttpRequester" - "$ref": "#/definitions/CustomRequester" download_target_extractor: - description: Responsible for fetching the final result `urls` provided by the completed / finished / ready async job. + description: Responsible for fetching the url where the file is located. 
This is applied on each records and not on the HTTP response anyOf: - "$ref": "#/definitions/DpathExtractor" - "$ref": "#/definitions/CustomRecordExtractor" @@ -3814,7 +3814,7 @@ definitions: - "$ref": "#/definitions/DpathExtractor" - "$ref": "#/definitions/CustomRecordExtractor" download_target_extractor: - description: Responsible for fetching the information needed to download the completed job from the polling HTTP response. + description: Responsible for fetching the final result `urls` provided by the completed / finished / ready async job. anyOf: - "$ref": "#/definitions/DpathExtractor" - "$ref": "#/definitions/CustomRecordExtractor" From 708f2a25ce848a2c95e62b6d24687472243c00a6 Mon Sep 17 00:00:00 2001 From: David Gold <32782137+dbgold17@users.noreply.github.com> Date: Wed, 10 Sep 2025 21:59:28 -0400 Subject: [PATCH 14/14] fix --- .../sources/declarative/models/declarative_component_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 2b71a97bd..e207e18f4 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2854,7 +2854,7 @@ class AsyncRetriever(BaseModel): ) download_target_extractor: Optional[Union[DpathExtractor, CustomRecordExtractor]] = Field( None, - description="Responsible for fetching the information needed to download the completed job from the polling HTTP response.", + description="Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.", ) download_extractor: Optional[ Union[DpathExtractor, CustomRecordExtractor, ResponseToFileExtractor]