Skip to content

Commit 0cac092

Browse files
authored
fix: validate storage name and alias values (#1950)
### Summary Storage aliases were not validated the same way storage names are. This change: - Adds validation for storage **aliases** (previously unvalidated), allowing the same simple identifier characters used for names plus the reserved `__default__` alias. - Ensures a storage name or alias always maps to a single subdirectory inside the configured storage directory, both via the high-level API and when the file-system storage clients are used directly. Includes unit tests for the new validation and for the file-system clients.
1 parent 66b4b5b commit 0cac092

13 files changed

Lines changed: 269 additions & 11 deletions

File tree

src/crawlee/_utils/file.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,39 @@ def _write_file(path: Path, data: str | bytes) -> None:
6060
raise
6161

6262

63+
def validate_subdirectory(base_dir: Path, subdirectory: str) -> Path:
64+
"""Resolve a storage subdirectory inside a base directory.
65+
66+
Joins `subdirectory` onto `base_dir` and verifies that the result is a direct child of `base_dir`, so a
67+
storage name or alias always maps to a single subdirectory under the storage directory rather than a nested
68+
path (e.g. `nested/inside`) or somewhere else entirely (e.g. a value containing `..` or an absolute path).
69+
70+
Args:
71+
base_dir: The base storage directory (e.g. the `key_value_stores` directory).
72+
subdirectory: The storage name or alias to use as the subdirectory.
73+
74+
Returns:
75+
The validated full path to the storage subdirectory.
76+
77+
Raises:
78+
ValueError: If the resolved path is not a direct child of `base_dir`.
79+
"""
80+
# Normalize lexically (no filesystem access), so symlinks are not followed and the check is deterministic.
81+
base_resolved = Path(os.path.normpath(base_dir))
82+
target_resolved = Path(os.path.normpath(base_dir / subdirectory))
83+
84+
# The target must be a direct child of the base directory, so it maps to a single subdirectory - reject path
85+
# separators, parent directory references ("..") and absolute paths.
86+
if target_resolved.parent != base_resolved:
87+
raise ValueError(
88+
f'Invalid storage name or alias "{subdirectory}". It must map to a single subdirectory under the '
89+
f'storage directory and must not contain path separators, parent directory references ("..") or '
90+
f'absolute paths.'
91+
)
92+
93+
return target_resolved
94+
95+
6396
def infer_mime_type(value: Any) -> str:
6497
"""Infer the MIME content type from the value.
6598

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from crawlee._consts import METADATA_FILENAME
1616
from crawlee._types import JsonSerializable
1717
from crawlee._utils.crypto import crypto_random_object_id
18-
from crawlee._utils.file import atomic_write, json_dumps
18+
from crawlee._utils.file import atomic_write, json_dumps, validate_subdirectory
1919
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
2020
from crawlee.storage_clients._base import DatasetClient
2121
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
@@ -159,8 +159,7 @@ async def open(
159159

160160
# Get a new instance by name or alias.
161161
else:
162-
dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')
163-
path_to_dataset = dataset_base_path / dataset_dir
162+
path_to_dataset = validate_subdirectory(dataset_base_path, name or alias or 'default')
164163
path_to_metadata = path_to_dataset / METADATA_FILENAME
165164

166165
# If the dataset directory exists, reconstruct the client from the metadata file.

src/crawlee/storage_clients/_file_system/_key_value_store_client.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from crawlee._consts import METADATA_FILENAME
1717
from crawlee._utils.crypto import crypto_random_object_id
18-
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
18+
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps, validate_subdirectory
1919
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
2020
from crawlee.storage_clients._base import KeyValueStoreClient
2121
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
@@ -157,8 +157,7 @@ async def open(
157157

158158
# Get a new instance by name or alias.
159159
else:
160-
kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
161-
path_to_kvs = kvs_base_path / kvs_dir
160+
path_to_kvs = validate_subdirectory(kvs_base_path, name or alias or 'default')
162161
path_to_metadata = path_to_kvs / METADATA_FILENAME
163162

164163
# If the key-value store directory exists, reconstruct the client from the metadata file.

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from crawlee import Request
1818
from crawlee._consts import METADATA_FILENAME
1919
from crawlee._utils.crypto import crypto_random_object_id
20-
from crawlee._utils.file import atomic_write, json_dumps
20+
from crawlee._utils.file import atomic_write, json_dumps, validate_subdirectory
2121
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
2222
from crawlee._utils.recoverable_state import RecoverableState
2323
from crawlee.storage_clients._base import RequestQueueClient
@@ -227,8 +227,7 @@ async def open(
227227

228228
# Open an existing RQ by its name or alias, or create a new one if not found.
229229
else:
230-
rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
231-
path_to_rq = rq_base_path / rq_dir
230+
path_to_rq = validate_subdirectory(rq_base_path, name or alias or 'default')
232231
path_to_metadata = path_to_rq / METADATA_FILENAME
233232

234233
# If the RQ directory exists, reconstruct the client from the metadata file.

src/crawlee/storages/_storage_instance_manager.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
1111
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
1212

13-
from ._utils import validate_storage_name
13+
from ._utils import validate_storage_alias, validate_storage_name
1414

1515
if TYPE_CHECKING:
1616
from ._base import Storage
@@ -135,6 +135,10 @@ async def open_storage_instance(
135135
if name is not None:
136136
validate_storage_name(name)
137137

138+
# Validate storage alias
139+
if alias is not None:
140+
validate_storage_alias(alias)
141+
138142
# Acquire lock for this opener
139143
opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)
140144
if not (lock := self._opener_locks.get(opener_lock_key)):

src/crawlee/storages/_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,20 @@ def validate_storage_name(name: str | None) -> None:
99
f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
1010
'"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
1111
)
12+
13+
14+
def validate_storage_alias(alias: str | None) -> None:
15+
"""Validate a storage alias that is used as an on-disk subdirectory name.
16+
17+
Unlike storage names, aliases may contain underscores and dots (e.g. the reserved `__default__` alias),
18+
but must not contain path separators or parent-directory references, so an alias always maps to a single
19+
directory under the storage directory.
20+
"""
21+
if alias is None:
22+
return
23+
24+
if not alias or '/' in alias or '\\' in alias or '\x00' in alias or alias in {'.', '..'}:
25+
raise ValueError(
26+
f'Invalid storage alias "{alias}". Alias must not be empty, contain path separators ("/", "\\") or '
27+
f'null bytes, or be a directory reference (".", "..").'
28+
)

tests/unit/_utils/test_file.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
from __future__ import annotations
22

33
from datetime import datetime, timezone
4+
from typing import TYPE_CHECKING
45

5-
from crawlee._utils.file import json_dumps
6+
import pytest
7+
8+
from crawlee._utils.file import json_dumps, validate_subdirectory
9+
10+
if TYPE_CHECKING:
11+
from pathlib import Path
612

713

814
async def test_json_dumps() -> None:
@@ -11,3 +17,42 @@ async def test_json_dumps() -> None:
1117
assert await json_dumps('string') == '"string"'
1218
assert await json_dumps(123) == '123'
1319
assert await json_dumps(datetime(2022, 1, 1, tzinfo=timezone.utc)) == '"2022-01-01 00:00:00+00:00"'
20+
21+
22+
# Tests for validate_subdirectory (storage name/alias directory validation).
23+
24+
25+
@pytest.mark.parametrize(
26+
'subdirectory',
27+
[
28+
pytest.param('my-store', id='simple'),
29+
pytest.param('store_with_underscores', id='underscores'),
30+
pytest.param('store.with.dots', id='dots'),
31+
pytest.param('__default__', id='reserved-default'),
32+
],
33+
)
34+
def test_validate_subdirectory_accepts_safe_segments(tmp_path: Path, subdirectory: str) -> None:
35+
base_dir = tmp_path / 'key_value_stores'
36+
result = validate_subdirectory(base_dir, subdirectory)
37+
# The resolved path must be a direct child of the base directory.
38+
assert result.parent == base_dir
39+
40+
41+
@pytest.mark.parametrize(
42+
'subdirectory',
43+
[
44+
pytest.param('../outside', id='parent-ref'),
45+
pytest.param('../../outside', id='deep-parent-ref'),
46+
pytest.param('..', id='bare-parent'),
47+
pytest.param('.', id='bare-current'),
48+
pytest.param('a/../../outside', id='mixed-parent-ref'),
49+
pytest.param('/etc/passwd', id='absolute-path'),
50+
pytest.param('', id='empty'),
51+
pytest.param('nested/inside', id='nested-path'),
52+
pytest.param('with/slash', id='with-slash'),
53+
],
54+
)
55+
def test_validate_subdirectory_rejects_invalid_segments(tmp_path: Path, subdirectory: str) -> None:
56+
base_dir = tmp_path / 'key_value_stores'
57+
with pytest.raises(ValueError, match='Invalid storage name or alias'):
58+
validate_subdirectory(base_dir, subdirectory)

tests/unit/storage_clients/_file_system/test_fs_dataset_client.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,33 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
5151
await client.drop()
5252

5353

54+
@pytest.mark.parametrize(
55+
'invalid_value',
56+
[
57+
pytest.param('../outside', id='parent-ref'),
58+
pytest.param('..', id='bare-parent'),
59+
pytest.param('/abs/outside', id='absolute-path'),
60+
],
61+
)
62+
async def test_open_rejects_invalid_name_or_alias(
63+
configuration: Configuration, tmp_path: Path, invalid_value: str
64+
) -> None:
65+
"""The low-level client must reject names/aliases that resolve outside the storage directory.
66+
67+
This covers direct usage of the storage client, which bypasses the high-level validation.
68+
"""
69+
storage_client = FileSystemStorageClient()
70+
71+
with pytest.raises(ValueError, match='Invalid storage name or alias'):
72+
await storage_client.create_dataset_client(alias=invalid_value, configuration=configuration)
73+
74+
with pytest.raises(ValueError, match='Invalid storage name or alias'):
75+
await storage_client.create_dataset_client(name=invalid_value, configuration=configuration)
76+
77+
# Nothing should have been written outside the configured storage directory.
78+
assert not (tmp_path / 'outside').exists()
79+
80+
5481
async def test_file_persistence_and_content_verification(dataset_client: FileSystemDatasetClient) -> None:
5582
"""Test that data is properly persisted to files with correct content."""
5683
item = {'key': 'value', 'number': 42}

tests/unit/storage_clients/_file_system/test_fs_kvs_client.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,33 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
4949
await client.drop()
5050

5151

52+
@pytest.mark.parametrize(
53+
'invalid_value',
54+
[
55+
pytest.param('../outside', id='parent-ref'),
56+
pytest.param('..', id='bare-parent'),
57+
pytest.param('/abs/outside', id='absolute-path'),
58+
],
59+
)
60+
async def test_open_rejects_invalid_name_or_alias(
61+
configuration: Configuration, tmp_path: Path, invalid_value: str
62+
) -> None:
63+
"""The low-level client must reject names/aliases that resolve outside the storage directory.
64+
65+
This covers direct usage of the storage client, which bypasses the high-level validation.
66+
"""
67+
storage_client = FileSystemStorageClient()
68+
69+
with pytest.raises(ValueError, match='Invalid storage name or alias'):
70+
await storage_client.create_kvs_client(alias=invalid_value, configuration=configuration)
71+
72+
with pytest.raises(ValueError, match='Invalid storage name or alias'):
73+
await storage_client.create_kvs_client(name=invalid_value, configuration=configuration)
74+
75+
# Nothing should have been written outside the configured storage directory.
76+
assert not (tmp_path / 'outside').exists()
77+
78+
5279
async def test_value_file_creation_and_content(kvs_client: FileSystemKeyValueStoreClient) -> None:
5380
"""Test that values are properly persisted to files with correct content and metadata."""
5481
test_key = 'test-key'

tests/unit/storage_clients/_file_system/test_fs_rq_client.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,33 @@ async def test_file_and_directory_creation() -> None:
5151
await client.drop()
5252

5353

54+
@pytest.mark.parametrize(
55+
'invalid_value',
56+
[
57+
pytest.param('../outside', id='parent-ref'),
58+
pytest.param('..', id='bare-parent'),
59+
pytest.param('/abs/outside', id='absolute-path'),
60+
],
61+
)
62+
async def test_open_rejects_invalid_name_or_alias(
63+
configuration: Configuration, tmp_path: Path, invalid_value: str
64+
) -> None:
65+
"""The low-level client must reject names/aliases that resolve outside the storage directory.
66+
67+
This covers direct usage of the storage client, which bypasses the high-level validation.
68+
"""
69+
storage_client = FileSystemStorageClient()
70+
71+
with pytest.raises(ValueError, match='Invalid storage name or alias'):
72+
await storage_client.create_rq_client(alias=invalid_value, configuration=configuration)
73+
74+
with pytest.raises(ValueError, match='Invalid storage name or alias'):
75+
await storage_client.create_rq_client(name=invalid_value, configuration=configuration)
76+
77+
# Nothing should have been written outside the configured storage directory.
78+
assert not (tmp_path / 'outside').exists()
79+
80+
5481
async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) -> None:
5582
"""Test that requests are properly persisted to files."""
5683
requests = [

0 commit comments

Comments
 (0)