From 5d370b58bce2aa72d323f0e53d60c575983bd3e9 Mon Sep 17 00:00:00 2001 From: Fokko Date: Tue, 6 Aug 2024 16:11:48 +0200 Subject: [PATCH 1/8] Rename `gcs.endpoint` to `gcs.service.host` To make it in line with Java: https://github.com/apache/iceberg/blob/6ee6d1327d3811dbd5795c4e87efdc41b7a58eaa/gcp/src/main/java/org/apache/iceberg/gcp/GCPProperties.java#L32 --- mkdocs/docs/configuration.md | 2 +- pyiceberg/io/__init__.py | 1 + pyiceberg/io/fsspec.py | 12 +++++++++++- pyiceberg/io/pyarrow.py | 10 +++++++++- tests/conftest.py | 6 +++--- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index ff3741656a..00a483c8d2 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -131,7 +131,7 @@ For the FileIO there are several configuration options available: | gcs.cache-timeout | 60 | Configure the cache expiration time in seconds for object metadata cache | | gcs.requester-pays | False | Configure whether to use requester-pays requests | | gcs.session-kwargs | {} | Configure a dict of parameters to pass on to aiohttp.ClientSession; can contain, for example, proxy settings. | -| gcs.endpoint | http://0.0.0.0:4443 | Configure an alternative endpoint for the GCS FileIO to access (format protocol://host:port) If not given, defaults to the value of environment variable "STORAGE_EMULATOR_HOST"; if that is not set either, will use the standard Google endpoint. | +| gcs.service.host | http://0.0.0.0:4443 | Configure an alternative endpoint for the GCS FileIO to access (format protocol://host:port) If not given, defaults to the value of environment variable "STORAGE_EMULATOR_HOST"; if that is not set either, will use the standard Google endpoint. | | gcs.default-location | US | Configure the default location where buckets are created, like 'US' or 'EUROPE-WEST3'. | | gcs.version-aware | False | Configure whether to support object versioning on the GCS bucket. | diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index d200874741..22aef520ab 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -78,6 +78,7 @@ GCS_REQUESTER_PAYS = "gcs.requester-pays" GCS_SESSION_KWARGS = "gcs.session-kwargs" GCS_ENDPOINT = "gcs.endpoint" +GCS_SERVICE_HOST = "gcs.service.host" GCS_DEFAULT_LOCATION = "gcs.default-bucket-location" GCS_VERSION_AWARE = "gcs.version-aware" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index d6e4a32add..b5bf7d2d7c 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -57,6 +57,7 @@ GCS_ENDPOINT, GCS_PROJECT_ID, GCS_REQUESTER_PAYS, + GCS_SERVICE_HOST, GCS_SESSION_KWARGS, GCS_TOKEN, GCS_VERSION_AWARE, @@ -76,6 +77,7 @@ OutputStream, ) from pyiceberg.typedef import Properties +from pyiceberg.utils.deprecated import deprecated from pyiceberg.utils.properties import get_first_property_value, property_as_bool logger = logging.getLogger(__name__) @@ -158,6 +160,14 @@ def _gs(properties: Properties) -> AbstractFileSystem: # https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem from gcsfs import GCSFileSystem + if (endpoint := properties.get(GCS_ENDPOINT)) and GCS_SERVICE_HOST not in properties: + deprecated( + deprecated_in="0.8.0", + removed_in="0.9.0", + help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", + )(lambda: None)() + properties[GCS_SERVICE_HOST] = endpoint + return GCSFileSystem( project=properties.get(GCS_PROJECT_ID), access=properties.get(GCS_ACCESS, "full_control"), @@ -166,7 +176,7 @@ def _gs(properties: Properties) -> AbstractFileSystem: cache_timeout=properties.get(GCS_CACHE_TIMEOUT), requester_pays=property_as_bool(properties, GCS_REQUESTER_PAYS, False), session_kwargs=json.loads(properties.get(GCS_SESSION_KWARGS, "{}")), - endpoint_url=properties.get(GCS_ENDPOINT), + endpoint_url=properties.get(GCS_SERVICE_HOST), default_location=properties.get(GCS_DEFAULT_LOCATION), version_aware=property_as_bool(properties, GCS_VERSION_AWARE, False), ) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index aefe86ac7a..1ab56490b3 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -89,6 +89,7 @@ AWS_SESSION_TOKEN, GCS_DEFAULT_LOCATION, GCS_ENDPOINT, + GCS_SERVICE_HOST, GCS_TOKEN, GCS_TOKEN_EXPIRES_AT_MS, HDFS_HOST, @@ -388,7 +389,14 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste gcs_kwargs["credential_token_expiration"] = millis_to_datetime(int(expiration)) if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION): gcs_kwargs["default_bucket_location"] = bucket_location - if endpoint := self.properties.get(GCS_ENDPOINT): + if (endpoint := self.properties.get(GCS_ENDPOINT)) and GCS_SERVICE_HOST not in self.properties: + deprecated( + deprecated_in="0.8.0", + removed_in="0.9.0", + help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", + )(lambda: None)() + self.properties[GCS_SERVICE_HOST] = endpoint + if endpoint := self.properties.get(GCS_SERVICE_HOST): url_parts = urlparse(endpoint) gcs_kwargs["scheme"] = url_parts.scheme gcs_kwargs["endpoint_override"] = url_parts.netloc diff --git a/tests/conftest.py b/tests/conftest.py index f65f4ed55f..ecd369ec46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,8 +53,8 @@ from pyiceberg.catalog.noop import NoopCatalog from pyiceberg.expressions import BoundReference from pyiceberg.io import ( - GCS_ENDPOINT, GCS_PROJECT_ID, + GCS_SERVICE_HOST, GCS_TOKEN, GCS_TOKEN_EXPIRES_AT_MS, fsspec, @@ -1873,7 +1873,7 @@ def fsspec_fileio(request: pytest.FixtureRequest) -> FsspecFileIO: @pytest.fixture def fsspec_fileio_gcs(request: pytest.FixtureRequest) -> FsspecFileIO: properties = { - GCS_ENDPOINT: request.config.getoption("--gcs.endpoint"), + GCS_SERVICE_HOST: request.config.getoption("--gcs.endpoint"), GCS_TOKEN: request.config.getoption("--gcs.oauth2.token"), GCS_PROJECT_ID: request.config.getoption("--gcs.project-id"), } @@ -1885,7 +1885,7 @@ def pyarrow_fileio_gcs(request: pytest.FixtureRequest) -> "PyArrowFileIO": from pyiceberg.io.pyarrow import PyArrowFileIO properties = { - GCS_ENDPOINT: request.config.getoption("--gcs.endpoint"), + GCS_SERVICE_HOST: request.config.getoption("--gcs.endpoint"), GCS_TOKEN: request.config.getoption("--gcs.oauth2.token"), GCS_PROJECT_ID: request.config.getoption("--gcs.project-id"), GCS_TOKEN_EXPIRES_AT_MS: datetime_to_millis(datetime.now()) + 60 * 1000, From 7157be37e41f3ebd921cee651de6407a62255a5c Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 12 Aug 2024 11:55:41 +0200 Subject: [PATCH 2/8] Import Co-authored-by: Andre Luis Anastacio --- pyiceberg/io/fsspec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index b5bf7d2d7c..15e9784438 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -77,7 +77,7 @@ OutputStream, ) from pyiceberg.typedef import Properties -from pyiceberg.utils.deprecated import deprecated +from pyiceberg.utils.deprecated import deprecation_message from pyiceberg.utils.properties import get_first_property_value, property_as_bool logger = logging.getLogger(__name__) From 03d69caa47804c72769833e100445eff5257cdd6 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 12 Aug 2024 11:55:53 +0200 Subject: [PATCH 3/8] Use `deprecation_message` instead Co-authored-by: Andre Luis Anastacio --- pyiceberg/io/fsspec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 15e9784438..75976dbda7 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -161,11 +161,11 @@ def _gs(properties: Properties) -> AbstractFileSystem: from gcsfs import GCSFileSystem if (endpoint := properties.get(GCS_ENDPOINT)) and GCS_SERVICE_HOST not in properties: - deprecated( + deprecation_message( deprecated_in="0.8.0", removed_in="0.9.0", help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", - )(lambda: None)() + ) properties[GCS_SERVICE_HOST] = endpoint return GCSFileSystem( From 204dac2b37a1456f8d9e24797539e5fe446e6f3e Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 12 Aug 2024 11:56:00 +0200 Subject: [PATCH 4/8] Use `deprecation_message` instead Co-authored-by: Andre Luis Anastacio --- pyiceberg/io/pyarrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 1ab56490b3..f2ac8deb2b 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -390,11 +390,11 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION): gcs_kwargs["default_bucket_location"] = bucket_location if (endpoint := self.properties.get(GCS_ENDPOINT)) and GCS_SERVICE_HOST not in self.properties: - deprecated( + deprecation_message( deprecated_in="0.8.0", removed_in="0.9.0", help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", - )(lambda: None)() + ) self.properties[GCS_SERVICE_HOST] = endpoint if endpoint := self.properties.get(GCS_SERVICE_HOST): url_parts = urlparse(endpoint) From d48040417d3fbf4c735d9811bb19fe846c8cdea3 Mon Sep 17 00:00:00 2001 From: Fokko Date: Tue, 5 Nov 2024 17:06:05 +0100 Subject: [PATCH 5/8] Fix message --- pyiceberg/io/pyarrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 4ad1a6c090..b331728a07 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -160,7 +160,7 @@ from pyiceberg.utils.concurrent import ExecutorFactory from pyiceberg.utils.config import Config from pyiceberg.utils.datetime import millis_to_datetime -from pyiceberg.utils.deprecated import deprecated +from pyiceberg.utils.deprecated import deprecated, deprecation_message from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int from pyiceberg.utils.singleton import Singleton from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string From d9b56e77fa84ed91aba170732c9d72c1b2aed67c Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 5 Nov 2024 22:23:44 +0100 Subject: [PATCH 6/8] Update pyiceberg/io/fsspec.py Co-authored-by: Kevin Liu --- pyiceberg/io/fsspec.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index bdae2ca88c..1f3caa9a66 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -172,14 +172,12 @@ def _gs(properties: Properties) -> AbstractFileSystem: # https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem from gcsfs import GCSFileSystem - if (endpoint := properties.get(GCS_ENDPOINT)) and GCS_SERVICE_HOST not in properties: + if properties.get(GCS_ENDPOINT): deprecation_message( deprecated_in="0.8.0", removed_in="0.9.0", help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", ) - properties[GCS_SERVICE_HOST] = endpoint - return GCSFileSystem( project=properties.get(GCS_PROJECT_ID), access=properties.get(GCS_ACCESS, "full_control"), From 04e8ff36f8fcc40e8c2012d80bfc230b93c324b7 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 5 Nov 2024 22:23:50 +0100 Subject: [PATCH 7/8] Update pyiceberg/io/fsspec.py Co-authored-by: Kevin Liu --- pyiceberg/io/fsspec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 1f3caa9a66..434ae67df0 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -186,7 +186,7 @@ def _gs(properties: Properties) -> AbstractFileSystem: cache_timeout=properties.get(GCS_CACHE_TIMEOUT), requester_pays=property_as_bool(properties, GCS_REQUESTER_PAYS, False), session_kwargs=json.loads(properties.get(GCS_SESSION_KWARGS, "{}")), - endpoint_url=properties.get(GCS_SERVICE_HOST), + endpoint_url=get_first_property_value(properties, GCS_SERVICE_HOST, GCS_ENDPOINT), default_location=properties.get(GCS_DEFAULT_LOCATION), version_aware=property_as_bool(properties, GCS_VERSION_AWARE, False), ) From ef0be4546a91ae4c54ae139b03402fce4f8dca63 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 6 Nov 2024 09:13:38 +0100 Subject: [PATCH 8/8] Update pyiceberg/io/pyarrow.py Co-authored-by: Kevin Liu --- pyiceberg/io/pyarrow.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index b331728a07..f221266eb5 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -391,14 +391,13 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste gcs_kwargs["credential_token_expiration"] = millis_to_datetime(int(expiration)) if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION): gcs_kwargs["default_bucket_location"] = bucket_location - if (endpoint := self.properties.get(GCS_ENDPOINT)) and GCS_SERVICE_HOST not in self.properties: - deprecation_message( - deprecated_in="0.8.0", - removed_in="0.9.0", - help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", - ) - self.properties[GCS_SERVICE_HOST] = endpoint - if endpoint := self.properties.get(GCS_SERVICE_HOST): + if endpoint := get_first_property_value(self.properties, GCS_SERVICE_HOST, GCS_ENDPOINT): + if self.properties.get(GCS_ENDPOINT): + deprecation_message( + deprecated_in="0.8.0", + removed_in="0.9.0", + help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", + ) url_parts = urlparse(endpoint) gcs_kwargs["scheme"] = url_parts.scheme gcs_kwargs["endpoint_override"] = url_parts.netloc