Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,13 @@
T = TypeVar("T")


@lru_cache
def _cached_resolve_s3_region(bucket: str) -> str:
from pyarrow.fs import resolve_s3_region

return resolve_s3_region(bucket=bucket)


class UnsupportedPyArrowTypeException(Exception):
"""Cannot convert PyArrow type to corresponding Iceberg type."""

Expand Down Expand Up @@ -414,19 +421,19 @@ def _initialize_oss_fs(self) -> FileSystem:
return S3FileSystem(**client_kwargs)

def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
from pyarrow.fs import S3FileSystem, resolve_s3_region
from pyarrow.fs import S3FileSystem

# Resolve region from netloc(bucket), fallback to user-provided region
provided_region = get_first_property_value(self.properties, S3_REGION, AWS_REGION)

try:
bucket_region = resolve_s3_region(bucket=netloc)
bucket_region = _cached_resolve_s3_region(bucket=netloc)
except (OSError, TypeError):
bucket_region = None
logger.warning(f"Unable to resolve region for bucket {netloc}, using default region {provided_region}")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about moving try/except to _cached_resolve_s3_region and caching even the None value? Does the result change when we retry to resolve the same netloc? Or do we need to warn the user every time it fails to resolve the netloc?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @hussein-awala Thanks for jumping in here, and I really like your suggestion. Just pushed the suggestion, LMKWYT.

Or do we need to warn the user every time it fails to resolve the netloc?

I was surprised by hearing that folks are seeing multiple warnings, since Python only warns once by design:

Repetitions of a particular warning for the same source location are typically suppressed.


bucket_region = bucket_region or provided_region
if bucket_region != provided_region:
if provided_region is not None and bucket_region != provided_region:
logger.warning(
f"PyArrow FileIO overriding S3 bucket region for bucket {netloc}: "
f"provided region {provided_region}, actual region {bucket_region}"
Expand Down