diff --git a/README.md b/README.md index f58d5e35..291f01c3 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,14 @@ Within the activated Python environment, use the following command to install go pip install gokart ``` +If you use S3 or GCS as a data store, install the corresponding extras: + +```bash +pip install gokart[s3] # S3 support +pip install gokart[gcs] # GCS support +pip install gokart[all] # both S3 and GCS +``` + # Quickstart diff --git a/docs/intro_to_gokart.rst b/docs/intro_to_gokart.rst index d5667486..4990a9c2 100644 --- a/docs/intro_to_gokart.rst +++ b/docs/intro_to_gokart.rst @@ -11,6 +11,13 @@ Within the activated Python environment, use the following command to install go pip install gokart +If you use S3 or GCS as a data store, install the corresponding extras: + +.. code:: sh + + pip install gokart[s3] # S3 support + pip install gokart[gcs] # GCS support + pip install gokart[all] # both S3 and GCS Quickstart diff --git a/docs/task_settings.rst b/docs/task_settings.rst index 247d21a8..aada0c45 100644 --- a/docs/task_settings.rst +++ b/docs/task_settings.rst @@ -25,9 +25,15 @@ It is recommended to use the config file since it does not change much. gokart.add_config('base.ini') -To use the S3 or GCS repository, please set the bucket path as ``s3://{YOUR_REPOSITORY_NAME}`` or ``gs://{YOUR_REPOSITORY_NAME}`` respectively. +To use the S3 or GCS repository, please install the corresponding extras and set the bucket path as ``s3://{YOUR_REPOSITORY_NAME}`` or ``gs://{YOUR_REPOSITORY_NAME}`` respectively. -If use S3 or GCS, please set credential information to Environment Variables. +.. code:: sh + + pip install gokart[s3] # for S3 support + pip install gokart[gcs] # for GCS support + pip install gokart[all] # for both S3 and GCS + +Also, please set your credential information in environment variables. .. 
code:: sh diff --git a/gokart/gcs_config.py b/gokart/gcs_config.py index f95f0287..47c2ecb6 100644 --- a/gokart/gcs_config.py +++ b/gokart/gcs_config.py @@ -2,11 +2,13 @@ import json import os -from typing import cast +from typing import TYPE_CHECKING, cast import luigi -import luigi.contrib.gcs -from google.oauth2.service_account import Credentials + +if TYPE_CHECKING: + import luigi.contrib.gcs + from google.oauth2.service_account import Credentials class GCSConfig(luigi.Config): @@ -19,9 +21,20 @@ def get_gcs_client(self) -> luigi.contrib.gcs.GCSClient: return self._client def _get_gcs_client(self) -> luigi.contrib.gcs.GCSClient: + try: + import googleapiclient # noqa: F401 + except ImportError: + raise ImportError('GCS support requires additional dependencies. Install them with: pip install gokart[gcs]') from None + import luigi.contrib.gcs + return luigi.contrib.gcs.GCSClient(oauth_credentials=self._load_oauth_credentials()) def _load_oauth_credentials(self) -> Credentials | None: + try: + from google.oauth2.service_account import Credentials + except ImportError: + raise ImportError('GCS support requires additional dependencies. 
Install them with: pip install gokart[gcs]') from None + json_str = os.environ.get(self.gcs_credential_name) if not json_str: return None diff --git a/gokart/gcs_obj_metadata_client.py b/gokart/gcs_obj_metadata_client.py index debbdc3f..2e356308 100644 --- a/gokart/gcs_obj_metadata_client.py +++ b/gokart/gcs_obj_metadata_client.py @@ -9,8 +9,6 @@ from typing import Any, Final from urllib.parse import urlsplit -from googleapiclient.model import makepatch - from gokart.gcs_config import GCSConfig from gokart.required_task_output import RequiredTaskOutput from gokart.utils import FlattenableItems @@ -39,6 +37,14 @@ def _path_to_bucket_and_key(path: str) -> tuple[str, str]: path_without_initial_slash = path[1:] return netloc, path_without_initial_slash + @staticmethod + def _makepatch(original: dict[str, Any], modified: dict[str, Any]) -> dict[str, Any]: + try: + from googleapiclient.model import makepatch + except ImportError: + raise ImportError('GCS support requires additional dependencies. 
Install them with: pip install gokart[gcs]') from None + return dict(makepatch(original, modified)) + @staticmethod def add_task_state_labels( path: str, @@ -78,7 +84,7 @@ def add_task_state_labels( .patch( bucket=bucket, object=obj, - body=makepatch({'metadata': original_metadata}, {'metadata': patched_metadata}), + body=GCSObjectMetadataClient._makepatch({'metadata': original_metadata}, {'metadata': patched_metadata}), ) .execute() ) diff --git a/gokart/gcs_zip_client.py b/gokart/gcs_zip_client.py index 8c26690a..43d7c391 100644 --- a/gokart/gcs_zip_client.py +++ b/gokart/gcs_zip_client.py @@ -4,12 +4,13 @@ import shutil from typing import cast -from gokart.gcs_config import GCSConfig from gokart.zip_client import ZipClient, _unzip_file class GCSZipClient(ZipClient): def __init__(self, file_path: str, temporary_directory: str) -> None: + from gokart.gcs_config import GCSConfig + self._file_path = file_path self._temporary_directory = temporary_directory self._client = GCSConfig().get_gcs_client() diff --git a/gokart/object_storage.py b/gokart/object_storage.py index 1cbf9e32..b7ddcd69 100644 --- a/gokart/object_storage.py +++ b/gokart/object_storage.py @@ -4,14 +4,8 @@ from typing import cast import luigi -import luigi.contrib.gcs -import luigi.contrib.s3 from luigi.format import Format -from gokart.gcs_config import GCSConfig -from gokart.gcs_zip_client import GCSZipClient -from gokart.s3_config import S3Config -from gokart.s3_zip_client import S3ZipClient from gokart.zip_client import ZipClient object_storage_path_prefix = ['s3://', 'gs://'] @@ -28,8 +22,26 @@ def if_object_storage_path(path: str) -> bool: @staticmethod def get_object_storage_target(path: str, format: Format) -> luigi.target.FileSystemTarget: if path.startswith('s3://'): + try: + import boto3 # noqa: F401 + except ImportError: + raise ImportError('S3 support requires additional dependencies. 
Install them with: pip install gokart[s3]') from None + + import luigi.contrib.s3 + + from gokart.s3_config import S3Config + return luigi.contrib.s3.S3Target(path, client=S3Config().get_s3_client(), format=format) elif path.startswith('gs://'): + try: + import googleapiclient # noqa: F401 + except ImportError: + raise ImportError('GCS support requires additional dependencies. Install them with: pip install gokart[gcs]') from None + + import luigi.contrib.gcs + + from gokart.gcs_config import GCSConfig + return luigi.contrib.gcs.GCSTarget(path, client=GCSConfig().get_gcs_client(), format=format) else: raise @@ -37,8 +49,12 @@ def get_object_storage_target(path: str, format: Format) -> luigi.target.FileSys @staticmethod def exists(path: str) -> bool: if path.startswith('s3://'): + from gokart.s3_config import S3Config + return cast(bool, S3Config().get_s3_client().exists(path)) elif path.startswith('gs://'): + from gokart.gcs_config import GCSConfig + return cast(bool, GCSConfig().get_gcs_client().exists(path)) else: raise @@ -46,8 +62,12 @@ def exists(path: str) -> bool: @staticmethod def get_timestamp(path: str) -> datetime: if path.startswith('s3://'): + from gokart.s3_config import S3Config + return cast(datetime, S3Config().get_s3_client().get_key(path).last_modified) elif path.startswith('gs://'): + from gokart.gcs_config import GCSConfig + # for gcs object # should PR to luigi bucket, obj = GCSConfig().get_gcs_client()._path_to_bucket_and_key(path) @@ -59,12 +79,22 @@ def get_timestamp(path: str) -> datetime: @staticmethod def get_zip_client(file_path: str, temporary_directory: str) -> ZipClient: if file_path.startswith('s3://'): + from gokart.s3_zip_client import S3ZipClient + return S3ZipClient(file_path=file_path, temporary_directory=temporary_directory) elif file_path.startswith('gs://'): + from gokart.gcs_zip_client import GCSZipClient + return GCSZipClient(file_path=file_path, temporary_directory=temporary_directory) else: raise @staticmethod def 
is_buffered_reader(file: object) -> bool: + try: + import boto3 # noqa: F401 + except ImportError: + return True + import luigi.contrib.s3 + return not isinstance(file, luigi.contrib.s3.ReadableS3File) diff --git a/gokart/s3_config.py b/gokart/s3_config.py index fde845e2..84cd136b 100644 --- a/gokart/s3_config.py +++ b/gokart/s3_config.py @@ -1,9 +1,12 @@ from __future__ import annotations import os +from typing import TYPE_CHECKING import luigi -import luigi.contrib.s3 + +if TYPE_CHECKING: + import luigi.contrib.s3 class S3Config(luigi.Config): @@ -18,6 +21,12 @@ def get_s3_client(self) -> luigi.contrib.s3.S3Client: return self._client def _get_s3_client(self) -> luigi.contrib.s3.S3Client: + try: + import boto3 # noqa: F401 + except ImportError: + raise ImportError('S3 support requires additional dependencies. Install them with: pip install gokart[s3]') from None + import luigi.contrib.s3 + return luigi.contrib.s3.S3Client( aws_access_key_id=os.environ.get(self.aws_access_key_id_name), aws_secret_access_key=os.environ.get(self.aws_secret_access_key_name) ) diff --git a/gokart/s3_zip_client.py b/gokart/s3_zip_client.py index fddcfc3d..57519d6c 100644 --- a/gokart/s3_zip_client.py +++ b/gokart/s3_zip_client.py @@ -4,12 +4,13 @@ import shutil from typing import cast -from gokart.s3_config import S3Config from gokart.zip_client import ZipClient, _unzip_file class S3ZipClient(ZipClient): def __init__(self, file_path: str, temporary_directory: str) -> None: + from gokart.s3_config import S3Config + self._file_path = file_path self._temporary_directory = temporary_directory self._client = S3Config().get_s3_client() diff --git a/pyproject.toml b/pyproject.toml index 4a35f375..1cb3ce4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,13 +13,10 @@ readme = "README.md" requires-python = ">=3.10, <4" dependencies = [ "luigi>=3.8.0", - "boto3", "slack-sdk", "pandas", "numpy", - "google-auth", "pyarrow", - "google-api-python-client", "APScheduler", "redis", "dill", @@ 
-39,6 +36,9 @@ classifiers = [ dynamic = ["version"] [project.optional-dependencies] +s3 = ["boto3"] +gcs = ["google-auth", "google-api-python-client"] +all = ["gokart[s3]", "gokart[gcs]"] polars = ["polars>=0.19.0"] [project.urls] @@ -48,6 +48,9 @@ Documentation = "https://gokart.readthedocs.io/en/latest/" [dependency-groups] test = [ + "boto3", + "google-auth", + "google-api-python-client", "fakeredis", "lupa", "matplotlib", diff --git a/test/test_extras.py b/test/test_extras.py new file mode 100644 index 00000000..998a2fb4 --- /dev/null +++ b/test/test_extras.py @@ -0,0 +1,206 @@ +from __future__ import annotations + +import sys +from collections.abc import Callable +from typing import Any +from unittest.mock import patch + +import luigi.format +import pytest + + +def _make_import_raiser(blocked_modules: set[str]) -> Callable[..., Any]: + """Return a side_effect for builtins.__import__ that raises ImportError for blocked modules.""" + original_import = __builtins__.__import__ if hasattr(__builtins__, '__import__') else __import__ + + def _import_raiser(name, *args, **kwargs): + for blocked in blocked_modules: + if name == blocked or name.startswith(blocked + '.'): + raise ImportError(f'No module named {name!r}') + return original_import(name, *args, **kwargs) + + return _import_raiser + + +class TestS3ExtrasNotInstalled: + def test_s3_config_raises_import_error_without_boto3(self): + from gokart.s3_config import S3Config + + config = S3Config() + config._client = None + + with patch('builtins.__import__', side_effect=_make_import_raiser({'boto3'})): + with pytest.raises(ImportError, match=r'pip install gokart\[s3\]'): + config._get_s3_client() + + def test_object_storage_get_target_raises_for_s3_without_extras(self): + from gokart.object_storage import ObjectStorage + + with patch('builtins.__import__', side_effect=_make_import_raiser({'boto3'})): + with pytest.raises(ImportError, match=r'pip install gokart\[s3\]'): + 
ObjectStorage.get_object_storage_target('s3://bucket/key', format=luigi.format.Nop) + + def test_object_storage_exists_raises_for_s3_without_extras(self): + from gokart.object_storage import ObjectStorage + + with patch('builtins.__import__', side_effect=_make_import_raiser({'boto3'})): + with pytest.raises(ImportError, match=r'pip install gokart\[s3\]'): + ObjectStorage.exists('s3://bucket/key') + + def test_object_storage_get_zip_client_raises_for_s3_without_extras(self): + from gokart.object_storage import ObjectStorage + + with patch('builtins.__import__', side_effect=_make_import_raiser({'boto3'})): + with pytest.raises(ImportError, match=r'pip install gokart\[s3\]'): + ObjectStorage.get_zip_client('s3://bucket/key.zip', '/tmp/test') + + def test_s3_zip_client_raises_without_extras(self): + from gokart.s3_zip_client import S3ZipClient + + with patch('builtins.__import__', side_effect=_make_import_raiser({'boto3'})): + with pytest.raises(ImportError, match=r'pip install gokart\[s3\]'): + S3ZipClient(file_path='s3://bucket/key.zip', temporary_directory='/tmp/test') + + def test_object_storage_is_buffered_reader_returns_true_without_s3(self): + from gokart.object_storage import ObjectStorage + + with patch('builtins.__import__', side_effect=_make_import_raiser({'boto3'})): + # boto3 module is already cached in sys.modules, so we also need to remove it + saved = sys.modules.pop('boto3', None) + try: + assert ObjectStorage.is_buffered_reader(object()) is True + finally: + if saved is not None: + sys.modules['boto3'] = saved + + +class TestGCSExtrasNotInstalled: + def test_gcs_config_raises_import_error_without_gcs_lib(self): + from gokart.gcs_config import GCSConfig + + config = GCSConfig() + config._client = None + + with patch('builtins.__import__', side_effect=_make_import_raiser({'googleapiclient'})): + saved = sys.modules.pop('googleapiclient', None) + try: + with pytest.raises(ImportError, match=r'pip install gokart\[gcs\]'): + config._get_gcs_client() + 
finally: + if saved is not None: + sys.modules['googleapiclient'] = saved + + def test_gcs_config_load_credentials_raises_without_google_auth(self): + from gokart.gcs_config import GCSConfig + + config = GCSConfig() + + with patch('builtins.__import__', side_effect=_make_import_raiser({'google.oauth2'})): + # Need to remove cached module + saved = sys.modules.pop('google.oauth2.service_account', None) + saved_parent = sys.modules.pop('google.oauth2', None) + try: + with pytest.raises(ImportError, match=r'pip install gokart\[gcs\]'): + config._load_oauth_credentials() + finally: + if saved is not None: + sys.modules['google.oauth2.service_account'] = saved + if saved_parent is not None: + sys.modules['google.oauth2'] = saved_parent + + def test_gcs_obj_metadata_client_makepatch_raises_without_googleapiclient(self): + from gokart.gcs_obj_metadata_client import GCSObjectMetadataClient + + with patch('builtins.__import__', side_effect=_make_import_raiser({'googleapiclient'})): + saved = sys.modules.pop('googleapiclient.model', None) + saved_parent = sys.modules.pop('googleapiclient', None) + try: + with pytest.raises(ImportError, match=r'pip install gokart\[gcs\]'): + GCSObjectMetadataClient._makepatch({}, {}) + finally: + if saved is not None: + sys.modules['googleapiclient.model'] = saved + if saved_parent is not None: + sys.modules['googleapiclient'] = saved_parent + + def test_object_storage_exists_raises_for_gcs_without_extras(self): + from gokart.object_storage import ObjectStorage + + with patch('builtins.__import__', side_effect=_make_import_raiser({'googleapiclient'})): + saved = sys.modules.pop('googleapiclient', None) + try: + with pytest.raises(ImportError, match=r'pip install gokart\[gcs\]'): + ObjectStorage.exists('gs://bucket/key') + finally: + if saved is not None: + sys.modules['googleapiclient'] = saved + + def test_object_storage_get_zip_client_raises_for_gcs_without_extras(self): + from gokart.object_storage import ObjectStorage + + with 
patch('builtins.__import__', side_effect=_make_import_raiser({'googleapiclient'})): + saved = sys.modules.pop('googleapiclient', None) + try: + with pytest.raises(ImportError, match=r'pip install gokart\[gcs\]'): + ObjectStorage.get_zip_client('gs://bucket/key.zip', '/tmp/test') + finally: + if saved is not None: + sys.modules['googleapiclient'] = saved + + def test_gcs_zip_client_raises_without_extras(self): + from gokart.gcs_zip_client import GCSZipClient + + with patch('builtins.__import__', side_effect=_make_import_raiser({'googleapiclient'})): + saved = sys.modules.pop('googleapiclient', None) + try: + with pytest.raises(ImportError, match=r'pip install gokart\[gcs\]'): + GCSZipClient(file_path='gs://bucket/key.zip', temporary_directory='/tmp/test') + finally: + if saved is not None: + sys.modules['googleapiclient'] = saved + + def test_object_storage_get_target_raises_for_gcs_without_extras(self): + from gokart.object_storage import ObjectStorage + + with patch('builtins.__import__', side_effect=_make_import_raiser({'googleapiclient'})): + saved = sys.modules.pop('googleapiclient', None) + try: + with pytest.raises(ImportError, match=r'pip install gokart\[gcs\]'): + ObjectStorage.get_object_storage_target('gs://bucket/key', format=luigi.format.Nop) + finally: + if saved is not None: + sys.modules['googleapiclient'] = saved + + +class TestExtrasInstalled: + """Verify that imports succeed when extras are installed (current test environment).""" + + def test_s3_config_can_be_imported(self): + from gokart.s3_config import S3Config + + assert S3Config is not None + + def test_gcs_config_can_be_imported(self): + from gokart.gcs_config import GCSConfig + + assert GCSConfig is not None + + def test_object_storage_can_be_imported(self): + from gokart.object_storage import ObjectStorage + + assert ObjectStorage is not None + + def test_gcs_obj_metadata_client_can_be_imported(self): + from gokart.gcs_obj_metadata_client import GCSObjectMetadataClient + + assert 
GCSObjectMetadataClient is not None + + def test_luigi_contrib_s3_is_importable(self): + import luigi.contrib.s3 + + assert luigi.contrib.s3 is not None + + def test_luigi_contrib_gcs_is_importable(self): + import luigi.contrib.gcs + + assert luigi.contrib.gcs is not None diff --git a/uv.lock b/uv.lock index 218b1141..8675d135 100644 --- a/uv.lock +++ b/uv.lock @@ -550,10 +550,7 @@ source = { editable = "." } dependencies = [ { name = "apscheduler" }, { name = "backoff" }, - { name = "boto3" }, { name = "dill" }, - { name = "google-api-python-client" }, - { name = "google-auth" }, { name = "luigi" }, { name = "numpy" }, { name = "pandas" }, @@ -565,9 +562,21 @@ dependencies = [ ] [package.optional-dependencies] +all = [ + { name = "boto3" }, + { name = "google-api-python-client" }, + { name = "google-auth" }, +] +gcs = [ + { name = "google-api-python-client" }, + { name = "google-auth" }, +] polars = [ { name = "polars" }, ] +s3 = [ + { name = "boto3" }, +] [package.dev-dependencies] lint = [ @@ -575,7 +584,10 @@ lint = [ { name = "ruff" }, ] test = [ + { name = "boto3" }, { name = "fakeredis" }, + { name = "google-api-python-client" }, + { name = "google-auth" }, { name = "lupa" }, { name = "matplotlib" }, { name = "moto" }, @@ -595,10 +607,13 @@ test = [ requires-dist = [ { name = "apscheduler" }, { name = "backoff" }, - { name = "boto3" }, + { name = "boto3", marker = "extra == 'all'" }, + { name = "boto3", marker = "extra == 's3'" }, { name = "dill" }, - { name = "google-api-python-client" }, - { name = "google-auth" }, + { name = "google-api-python-client", marker = "extra == 'all'" }, + { name = "google-api-python-client", marker = "extra == 'gcs'" }, + { name = "google-auth", marker = "extra == 'all'" }, + { name = "google-auth", marker = "extra == 'gcs'" }, { name = "luigi", specifier = ">=3.8.0" }, { name = "numpy" }, { name = "pandas" }, @@ -608,7 +623,7 @@ requires-dist = [ { name = "slack-sdk" }, { name = "typing-extensions", marker = 
"python_full_version < '3.13'", specifier = ">=4.11.0" }, ] -provides-extras = ["polars"] +provides-extras = ["all", "gcs", "polars", "s3"] [package.metadata.requires-dev] lint = [ @@ -616,7 +631,10 @@ lint = [ { name = "ruff" }, ] test = [ + { name = "boto3" }, { name = "fakeredis" }, + { name = "google-api-python-client" }, + { name = "google-auth" }, { name = "lupa" }, { name = "matplotlib" }, { name = "moto", specifier = ">=4.0" },