Skip to content

Commit 1afff77

Browse files
author
JannicCutura
committed
fix: preserve original path for non-S3 schemes in access point resolution
The _resolve_s3_access_point method was incorrectly constructing paths for non-S3 schemes (like local files) by concatenating netloc and path_suffix. This caused issues when local paths had double slashes (e.g., //tmp/...) because urlparse interprets these as network paths with netloc. Now the method takes the original path from parse_location as a parameter and returns it unchanged for non-S3 schemes, ensuring local file operations work correctly.
1 parent dcff1ac commit 1afff77

File tree

2 files changed

+22
-17
lines changed

2 files changed

+22
-17
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,7 @@ def _initialize_gcs_fs(self) -> FileSystem:
589589
def _initialize_local_fs(self) -> FileSystem:
590590
return PyArrowLocalFileSystem()
591591

592-
def _resolve_s3_access_point(self, scheme: str, netloc: str, path_suffix: str) -> tuple[str, str]:
592+
def _resolve_s3_access_point(self, scheme: str, netloc: str, path_suffix: str, original_path: str) -> tuple[str, str]:
593593
"""Resolve S3 access point alias for a bucket if configured.
594594
595595
For cross-account access, S3 paths need to use access point aliases instead of bucket names.
@@ -600,12 +600,13 @@ def _resolve_s3_access_point(self, scheme: str, netloc: str, path_suffix: str) -
600600
scheme: The URI scheme (s3, s3a, s3n)
601601
netloc: The bucket name from the original URI
602602
path_suffix: The path within the bucket (without bucket name)
603+
original_path: The original path from parse_location (fallback for non-S3)
603604
604605
Returns:
605606
Tuple of (resolved_netloc, resolved_path) where netloc may be replaced with access point alias
606607
"""
607608
if scheme not in {"s3", "s3a", "s3n"}:
608-
return netloc, f"{netloc}{path_suffix}"
609+
return netloc, original_path
609610

610611
# Check for access point alias configuration for this bucket
611612
access_point_key = f"{S3_ACCESS_POINT_PREFIX}{netloc}"
@@ -614,7 +615,7 @@ def _resolve_s3_access_point(self, scheme: str, netloc: str, path_suffix: str) -
614615
# Replace bucket with access point alias in the path
615616
return access_point_alias, f"{access_point_alias}{path_suffix}"
616617

617-
return netloc, f"{netloc}{path_suffix}"
618+
return netloc, original_path
618619

619620
def new_input(self, location: str) -> PyArrowFile:
620621
"""Get a PyArrowFile instance to read bytes from the file at the given location.
@@ -625,10 +626,10 @@ def new_input(self, location: str) -> PyArrowFile:
625626
Returns:
626627
PyArrowFile: A PyArrowFile instance for the given location.
627628
"""
628-
scheme, netloc, _ = self.parse_location(location, self.properties)
629-
# For S3, resolve access point ARN if configured
629+
scheme, netloc, path = self.parse_location(location, self.properties)
630+
# For S3, resolve access point if configured
630631
uri = urlparse(location)
631-
resolved_netloc, resolved_path = self._resolve_s3_access_point(scheme, netloc, uri.path)
632+
resolved_netloc, resolved_path = self._resolve_s3_access_point(scheme, netloc, uri.path, path)
632633
return PyArrowFile(
633634
fs=self.fs_by_scheme(scheme, resolved_netloc),
634635
location=location,
@@ -645,10 +646,10 @@ def new_output(self, location: str) -> PyArrowFile:
645646
Returns:
646647
PyArrowFile: A PyArrowFile instance for the given location.
647648
"""
648-
scheme, netloc, _ = self.parse_location(location, self.properties)
649-
# For S3, resolve access point ARN if configured
649+
scheme, netloc, path = self.parse_location(location, self.properties)
650+
# For S3, resolve access point if configured
650651
uri = urlparse(location)
651-
resolved_netloc, resolved_path = self._resolve_s3_access_point(scheme, netloc, uri.path)
652+
resolved_netloc, resolved_path = self._resolve_s3_access_point(scheme, netloc, uri.path, path)
652653
return PyArrowFile(
653654
fs=self.fs_by_scheme(scheme, resolved_netloc),
654655
location=location,
@@ -670,10 +671,10 @@ def delete(self, location: str | InputFile | OutputFile) -> None:
670671
an AWS error code 15.
671672
"""
672673
str_location = location.location if isinstance(location, (InputFile, OutputFile)) else location
673-
scheme, netloc, _ = self.parse_location(str_location, self.properties)
674-
# For S3, resolve access point ARN if configured
674+
scheme, netloc, path = self.parse_location(str_location, self.properties)
675+
# For S3, resolve access point if configured
675676
uri = urlparse(str_location)
676-
resolved_netloc, resolved_path = self._resolve_s3_access_point(scheme, netloc, uri.path)
677+
resolved_netloc, resolved_path = self._resolve_s3_access_point(scheme, netloc, uri.path, path)
677678
fs = self.fs_by_scheme(scheme, resolved_netloc)
678679

679680
try:

tests/io/test_pyarrow.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,8 @@ def test_s3_access_point_resolution_with_config() -> None:
458458
fileio = PyArrowFileIO(properties=properties)
459459

460460
# Test _resolve_s3_access_point directly
461-
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("s3", bucket_name, "/path/to/file.parquet")
461+
original_path = f"{bucket_name}/path/to/file.parquet"
462+
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("s3", bucket_name, "/path/to/file.parquet", original_path)
462463

463464
assert resolved_netloc == access_point_alias
464465
assert resolved_path == f"{access_point_alias}/path/to/file.parquet"
@@ -469,7 +470,8 @@ def test_s3_access_point_resolution_without_config() -> None:
469470
bucket_name = "my-bucket"
470471
fileio = PyArrowFileIO(properties={})
471472

472-
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("s3", bucket_name, "/path/to/file.parquet")
473+
original_path = f"{bucket_name}/path/to/file.parquet"
474+
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("s3", bucket_name, "/path/to/file.parquet", original_path)
473475

474476
assert resolved_netloc == bucket_name
475477
assert resolved_path == f"{bucket_name}/path/to/file.parquet"
@@ -486,10 +488,11 @@ def test_s3_access_point_resolution_non_s3_scheme() -> None:
486488
fileio = PyArrowFileIO(properties=properties)
487489

488490
# Test with non-S3 scheme (should not resolve)
489-
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("gs", bucket_name, "/path/to/file.parquet")
491+
original_path = f"{bucket_name}/path/to/file.parquet"
492+
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("gs", bucket_name, "/path/to/file.parquet", original_path)
490493

491494
assert resolved_netloc == bucket_name
492-
assert resolved_path == f"{bucket_name}/path/to/file.parquet"
495+
assert resolved_path == original_path
493496

494497

495498
def test_s3_access_point_resolution_s3a_scheme() -> None:
@@ -502,7 +505,8 @@ def test_s3_access_point_resolution_s3a_scheme() -> None:
502505

503506
fileio = PyArrowFileIO(properties=properties)
504507

505-
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("s3a", bucket_name, "/path/to/file.parquet")
508+
original_path = f"{bucket_name}/path/to/file.parquet"
509+
resolved_netloc, resolved_path = fileio._resolve_s3_access_point("s3a", bucket_name, "/path/to/file.parquet", original_path)
506510

507511
assert resolved_netloc == access_point_alias
508512
assert resolved_path == f"{access_point_alias}/path/to/file.parquet"

0 commit comments

Comments (0)