Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions gcsfs/extended_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ def __init__(self, *args, finalize_on_close=False, **kwargs):
finalize_on_close : bool, default False
By default, files in zonal buckets are left unfinalized to allow appends.
**kwargs : dict
- cache_unknown_buckets : bool, default False
Whether to cache UNKNOWN bucket types. Useful when users lack permissions
for the Storage Control API to avoid repeated slow failing lookups.
Additional arguments passed to GCSFileSystem.
Supports retry configuration overrides for Storage Control API:
- retry_timeout: Total time to spend retrying (seconds).
Expand All @@ -104,6 +107,7 @@ def __init__(self, *args, finalize_on_close=False, **kwargs):
- retry_multiplier: Multiplier for delay between retries.
These map to `google.api_core.retry.AsyncRetry` arguments (without 'retry_' prefix).
"""
self._cache_unknown_buckets = kwargs.pop("cache_unknown_buckets", False)
Copy link
Copy Markdown
Collaborator

@zhixiangli zhixiangli May 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a user sets _cache_unknown_buckets to true and has the required permissions, is the result still cached when a transient error occurs? Could we check if it is a permission error / cache with a TTL, allowing for access to be granted later? WDYT?

valid_keys = DEFAULT_RETRY_CONFIG.keys()
self.retry_config = {
k[6:]: v
Expand Down Expand Up @@ -194,8 +198,7 @@ async def _lookup_bucket_type(self, bucket):
if bucket in self._storage_layout_cache:
return self._storage_layout_cache[bucket]
bucket_type = await self._get_bucket_type(bucket)
# Dont cache UNKNOWN type
if bucket_type == BucketType.UNKNOWN:
if bucket_type == BucketType.UNKNOWN and not self._cache_unknown_buckets:
return bucket_type
self._storage_layout_cache[bucket] = bucket_type
return self._storage_layout_cache[bucket]
Expand Down
52 changes: 52 additions & 0 deletions gcsfs/tests/test_extended_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1611,3 +1611,55 @@ async def test_cat_file_non_zonal_fallback(extended_gcsfs):
mock_super_cat.assert_awaited_once_with(
"standard_bucket/obj", start=10, end=20, concurrency=2, custom_arg="val"
)


@pytest.mark.asyncio
async def test_lookup_bucket_type_not_cached_unknown(extended_gcsfs):
    """Verify UNKNOWN bucket types are re-fetched when _cache_unknown_buckets is False."""
    fs = extended_gcsfs
    fs._cache_unknown_buckets = False

    # Start from an empty layout cache so earlier lookups cannot interfere.
    fs._storage_layout_cache.clear()

    # Force the backend lookup to report an UNKNOWN bucket type.
    with mock.patch.object(
        fs, "_get_bucket_type", new_callable=mock.AsyncMock
    ) as patched_get_type:
        patched_get_type.return_value = BucketType.UNKNOWN

        # Initial lookup must go to the backend.
        first_result = await fs._lookup_bucket_type("my-bucket")
        assert first_result == BucketType.UNKNOWN
        assert patched_get_type.call_count == 1

        # UNKNOWN was not cached, so a repeat lookup hits the backend again.
        second_result = await fs._lookup_bucket_type("my-bucket")
        assert second_result == BucketType.UNKNOWN
        assert patched_get_type.call_count == 2


@pytest.mark.asyncio
async def test_lookup_bucket_type_cached_unknown(extended_gcsfs):
    """Verify UNKNOWN bucket types are cached when _cache_unknown_buckets is True."""
    fs = extended_gcsfs
    fs._cache_unknown_buckets = True

    # Start from an empty layout cache so earlier lookups cannot interfere.
    fs._storage_layout_cache.clear()

    # Force the backend lookup to report an UNKNOWN bucket type.
    with mock.patch.object(
        fs, "_get_bucket_type", new_callable=mock.AsyncMock
    ) as patched_get_type:
        patched_get_type.return_value = BucketType.UNKNOWN

        # Initial lookup must go to the backend.
        first_result = await fs._lookup_bucket_type("my-bucket")
        assert first_result == BucketType.UNKNOWN
        assert patched_get_type.call_count == 1

        # UNKNOWN was cached, so a repeat lookup must not hit the backend.
        second_result = await fs._lookup_bucket_type("my-bucket")
        assert second_result == BucketType.UNKNOWN
        assert patched_get_type.call_count == 1
Loading