Skip to content

Commit d1680b9

Browse files
committed
fix: NFC-normalize project_key; retry rmtree on transient errors; skip materialize w/ custom transport
- project_key_for_directory() and _project_key_from_dir() now route through _canonicalize_path() (realpath + NFC) so the SDK's project_key matches the CLI's on filesystems that decompose Unicode (macOS HFS+). Without this, store.load() silently misses on paths like 'café' stored as 'cafe\u0301'. - Add _rmtree_with_retry() in session_resume.py: retries shutil.rmtree on EBUSY/EPERM/EACCES/ENOTEMPTY/EMFILE/ENFILE up to 4 times with 100ms backoff, then falls back to ignore_errors=True. Replaces both rmtree call sites (failure-path cleanup and the cleanup() callback). On Windows, AV/indexer briefly holding .credentials.json no longer leaks the access token in temp. - Gate materialize_resume_session() on no custom transport in both ClaudeSDKClient.connect() and InternalClient.process_query(). A pre-constructed transport never sees the materialized options, so loading the store and writing .credentials.json to a temp dir was wasted work that left the access token on disk for the session lifetime. Tests: +5 (NFC normalization, rmtree retry on cleanup() and failure-path, custom-transport skip on connect() and query()).
1 parent ca734db commit d1680b9

7 files changed

Lines changed: 241 additions & 13 deletions

File tree

src/claude_agent_sdk/_internal/client.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,12 @@ async def process_query(
6060

6161
# resume/continue + session_store: load the session from the store
6262
# into a temp CLAUDE_CONFIG_DIR for the subprocess to resume from.
63-
materialized = await materialize_resume_session(options)
63+
# Skipped when a custom transport was supplied — the materialized
64+
# options never reach a pre-constructed transport, so loading the
65+
# store and writing .credentials.json to a temp dir would be wasted.
66+
materialized = (
67+
await materialize_resume_session(options) if transport is None else None
68+
)
6469
try:
6570
async for msg in self._process_query_inner(
6671
prompt, options, transport, materialized

src/claude_agent_sdk/_internal/session_resume.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from __future__ import annotations
1515

1616
import asyncio
17+
import errno
1718
import getpass
1819
import json
1920
import logging
@@ -119,11 +120,11 @@ async def materialize_resume_session(
119120
# so asyncio.CancelledError (BaseException since 3.8) also triggers
120121
# cleanup — callers can't compensate because the assignment raises
121122
# before completing.
122-
shutil.rmtree(tmp_base, ignore_errors=True)
123+
await _rmtree_with_retry(tmp_base)
123124
raise
124125

125126
async def cleanup() -> None:
126-
shutil.rmtree(tmp_base, ignore_errors=True)
127+
await _rmtree_with_retry(tmp_base)
127128

128129
return MaterializedResume(
129130
config_dir=tmp_base,
@@ -136,6 +137,52 @@ async def cleanup() -> None:
136137
# Helpers
137138
# ---------------------------------------------------------------------------
138139

140+
# OSError errnos that indicate a transiently-held handle (Windows AV/indexer
141+
# scanning a freshly-written file) rather than a permanent failure.
142+
_RETRYABLE_RMTREE_ERRNOS = frozenset(
143+
{
144+
errno.EBUSY,
145+
errno.EMFILE,
146+
errno.ENFILE,
147+
errno.ENOTEMPTY,
148+
errno.EPERM,
149+
errno.EACCES,
150+
}
151+
)
152+
153+
154+
async def _rmtree_with_retry(
155+
path: Path, *, retries: int = 4, delay: float = 0.1
156+
) -> None:
157+
"""Best-effort ``shutil.rmtree`` with retries on transient lock errors.
158+
159+
On Windows, AV/indexer can briefly hold a handle on freshly-written
160+
files (notably ``.credentials.json``), causing rmtree to fail with
161+
EBUSY/EPERM. Retry a few times with a short backoff; after exhausting
162+
retries, fall back to ``ignore_errors=True`` (matches the previous
163+
behavior, but gives the handle a chance to release first so the access
164+
token doesn't leak in temp). Never raises.
165+
"""
166+
if not path.exists():
167+
return
168+
for _ in range(retries):
169+
try:
170+
shutil.rmtree(path)
171+
return
172+
except OSError as e:
173+
if e.errno not in _RETRYABLE_RMTREE_ERRNOS and not isinstance(
174+
e, PermissionError
175+
):
176+
break
177+
try:
178+
await asyncio.sleep(delay)
179+
except asyncio.CancelledError:
180+
# Best-effort final sweep before propagating cancellation so a
181+
# cancelled connect() doesn't leak the temp dir.
182+
shutil.rmtree(path, ignore_errors=True)
183+
raise
184+
shutil.rmtree(path, ignore_errors=True)
185+
139186

140187
async def _load_candidate(
141188
store: SessionStore, project_key: str, session_id: str, timeout_s: float

src/claude_agent_sdk/_internal/session_store.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
SessionStoreEntry,
1414
SessionStoreListEntry,
1515
)
16-
from .sessions import _sanitize_path
16+
from .sessions import _canonicalize_path, _sanitize_path
1717

1818

1919
def _key_to_string(key: SessionKey) -> str:
@@ -150,9 +150,11 @@ def file_path_to_session_key(file_path: str, projects_dir: str) -> SessionKey |
150150
def project_key_for_directory(directory: str | Path | None = None) -> str:
151151
"""Derive the :class:`SessionStore` ``project_key`` for a directory.
152152
153-
Defaults to the current working directory. Uses the same djb2-hashed
154-
sanitization the CLI uses for project directory names, so keys match
155-
between local-disk transcripts and store-mirrored transcripts.
153+
Defaults to the current working directory. Uses the same realpath + NFC
154+
normalization + djb2-hashed sanitization the CLI uses for project
155+
directory names, so keys match between local-disk transcripts and
156+
store-mirrored transcripts even on filesystems that decompose Unicode
157+
(macOS HFS+).
156158
"""
157-
abs_path = Path(directory if directory is not None else ".").resolve()
158-
return _sanitize_path(str(abs_path))
159+
abs_path = _canonicalize_path(str(directory) if directory is not None else ".")
160+
return _sanitize_path(abs_path)

src/claude_agent_sdk/_internal/sessions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1410,8 +1410,8 @@ def _project_key_from_dir(directory: str | None) -> str:
14101410
:func:`project_key_for_directory` but is defined locally to avoid a
14111411
circular import with ``session_store.py``.
14121412
"""
1413-
abs_path = Path(directory if directory is not None else ".").resolve()
1414-
return _sanitize_path(str(abs_path))
1413+
abs_path = _canonicalize_path(directory if directory is not None else ".")
1414+
return _sanitize_path(abs_path)
14151415

14161416

14171417
def _entries_to_jsonl(entries: list[Any]) -> str:

src/claude_agent_sdk/client.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,15 @@ async def _empty_stream() -> AsyncIterator[dict[str, Any]]:
121121
# into a temp CLAUDE_CONFIG_DIR for the subprocess to resume from.
122122
# When materialized, override resume/continue/env on a copy of options
123123
# so the subprocess points at the temp dir; when None, fall through
124-
# to normal handling (fresh session or local-disk resume).
125-
self._materialized = await materialize_resume_session(self.options)
124+
# to normal handling (fresh session or local-disk resume). Skipped
125+
# when a custom transport was supplied — the materialized options
126+
# never reach a pre-constructed transport, so loading the store and
127+
# writing .credentials.json to a temp dir would be wasted work.
128+
self._materialized = (
129+
await materialize_resume_session(self.options)
130+
if self._custom_transport is None
131+
else None
132+
)
126133
try:
127134
await self._connect_inner(prompt, actual_prompt)
128135
except BaseException:

tests/test_session_resume.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
from __future__ import annotations
44

55
import asyncio
6+
import errno
67
import json
8+
import shutil
79
import tempfile
810
import uuid
911
from pathlib import Path
@@ -755,6 +757,79 @@ def capture_transport(*, prompt, options):
755757
# Cleanup removed the temp dir.
756758
assert not Path(config_dir).exists()
757759

760+
@pytest.mark.asyncio
761+
async def test_custom_transport_skips_materialization(
762+
self,
763+
cwd: Path,
764+
project_key: str,
765+
isolated_home: Path,
766+
track_resume_dirs: list[Path],
767+
) -> None:
768+
"""A pre-constructed custom transport never sees the materialized
769+
options, so loading the store and writing .credentials.json to a
770+
temp dir would be wasted (and leave the access token on disk for
771+
the session lifetime). connect() must skip materialization."""
772+
773+
class SpyStore(InMemorySessionStore):
774+
load_calls = 0
775+
776+
async def load(self, key): # type: ignore[override]
777+
SpyStore.load_calls += 1
778+
return await super().load(key)
779+
780+
store = SpyStore()
781+
await store.append(
782+
{"project_key": project_key, "session_id": SESSION_ID},
783+
[{"type": "user", "uuid": "u1"}],
784+
)
785+
786+
opts = ClaudeAgentOptions(cwd=cwd, session_store=store, resume=SESSION_ID)
787+
client = ClaudeSDKClient(options=opts, transport=_make_mock_transport())
788+
789+
with patch(
790+
"claude_agent_sdk._internal.query.Query.initialize",
791+
new_callable=AsyncMock,
792+
):
793+
await client.connect()
794+
assert SpyStore.load_calls == 0
795+
assert not track_resume_dirs # mkdtemp never ran
796+
assert client._materialized is None
797+
await client.disconnect()
798+
799+
@pytest.mark.asyncio
800+
async def test_query_custom_transport_skips_materialization(
801+
self,
802+
cwd: Path,
803+
project_key: str,
804+
isolated_home: Path,
805+
track_resume_dirs: list[Path],
806+
) -> None:
807+
"""Same gate for the one-shot ``query()`` path."""
808+
809+
class SpyStore(InMemorySessionStore):
810+
load_calls = 0
811+
812+
async def load(self, key): # type: ignore[override]
813+
SpyStore.load_calls += 1
814+
return await super().load(key)
815+
816+
store = SpyStore()
817+
await store.append(
818+
{"project_key": project_key, "session_id": SESSION_ID},
819+
[{"type": "user", "uuid": "u1"}],
820+
)
821+
822+
opts = ClaudeAgentOptions(cwd=cwd, session_store=store, resume=SESSION_ID)
823+
custom = _make_mock_transport()
824+
custom.connect = AsyncMock(side_effect=OSError("spawn failed"))
825+
with pytest.raises(OSError, match="spawn failed"):
826+
async for _ in query(prompt="hi", options=opts, transport=custom):
827+
pass # pragma: no cover
828+
829+
# Gate runs before transport.connect(); materialization never happened.
830+
assert SpyStore.load_calls == 0
831+
assert not track_resume_dirs
832+
758833
@pytest.mark.asyncio
759834
async def test_connect_no_materialization_passthrough(
760835
self, cwd: Path, isolated_home: Path
@@ -815,6 +890,84 @@ class TestSpawnFailureCleanup:
815890
removed even when transport.connect() raises before any try/finally that
816891
normally guards cleanup."""
817892

893+
@pytest.mark.asyncio
894+
async def test_cleanup_retries_on_transient_os_error(
895+
self,
896+
cwd: Path,
897+
project_key: str,
898+
isolated_home: Path,
899+
monkeypatch: pytest.MonkeyPatch,
900+
) -> None:
901+
"""Windows AV/indexer can briefly hold ``.credentials.json`` open;
902+
``cleanup()`` must retry rmtree on EPERM/EBUSY so the access token
903+
doesn't leak in temp."""
904+
store = InMemorySessionStore()
905+
await store.append(
906+
{"project_key": project_key, "session_id": SESSION_ID},
907+
[{"type": "user", "uuid": "u1"}],
908+
)
909+
opts = ClaudeAgentOptions(cwd=cwd, session_store=store, resume=SESSION_ID)
910+
m = await materialize_resume_session(opts)
911+
assert m is not None
912+
config_dir = m.config_dir
913+
914+
calls: list[Any] = []
915+
real_rmtree = shutil.rmtree
916+
917+
def fake_rmtree(p: Any, **kw: Any) -> None:
918+
calls.append((p, kw))
919+
if len(calls) <= 2 and not kw.get("ignore_errors"):
920+
raise PermissionError(errno.EPERM, "held by indexer")
921+
if Path(p).exists():
922+
real_rmtree(p, **kw)
923+
924+
monkeypatch.setattr(shutil, "rmtree", fake_rmtree)
925+
await m.cleanup()
926+
927+
assert not config_dir.exists()
928+
assert len(calls) >= 3 # 2 failures + 1 success
929+
930+
@pytest.mark.asyncio
931+
async def test_failure_path_retries_rmtree(
932+
self,
933+
cwd: Path,
934+
project_key: str,
935+
isolated_home: Path,
936+
monkeypatch: pytest.MonkeyPatch,
937+
track_resume_dirs: list[Path],
938+
) -> None:
939+
"""The except-BaseException cleanup path also retries on EPERM."""
940+
941+
class FailLateStore(InMemorySessionStore):
942+
async def list_subkeys(self, key): # type: ignore[override]
943+
raise OSError("boom")
944+
945+
store = FailLateStore()
946+
await store.append(
947+
{"project_key": project_key, "session_id": SESSION_ID},
948+
[{"type": "user", "uuid": "u1"}],
949+
)
950+
951+
calls: list[Any] = []
952+
real_rmtree = shutil.rmtree
953+
954+
def fake_rmtree(p: Any, **kw: Any) -> None:
955+
calls.append((p, kw))
956+
if len(calls) <= 2 and not kw.get("ignore_errors"):
957+
raise PermissionError(errno.EPERM, "held by indexer")
958+
if Path(p).exists():
959+
real_rmtree(p, **kw)
960+
961+
monkeypatch.setattr(shutil, "rmtree", fake_rmtree)
962+
963+
opts = ClaudeAgentOptions(cwd=cwd, session_store=store, resume=SESSION_ID)
964+
with pytest.raises(RuntimeError, match="boom"):
965+
await materialize_resume_session(opts)
966+
967+
assert track_resume_dirs
968+
assert not track_resume_dirs[0].exists()
969+
assert len(calls) >= 3
970+
818971
@pytest.mark.asyncio
819972
async def test_client_connect_failure_removes_temp_dir(
820973
self,

tests/test_session_store_conformance.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,20 @@ def test_relative_dir_resolved_to_absolute_before_hashing(self) -> None:
228228
assert key == _sanitize_path(str(Path().resolve()))
229229
assert key != _sanitize_path(".")
230230

231+
def test_nfc_normalizes_decomposed_unicode(self, tmp_path) -> None:
232+
"""Parity with TS: the CLI canonicalizes via realpath + NFC. On macOS
233+
HFS+ a path like ``café`` may be stored decomposed (``cafe\\u0301``);
234+
without NFC normalization the SDK's project_key would mismatch the
235+
CLI's and store.load() would silently miss."""
236+
import unicodedata
237+
238+
nfc = tmp_path / unicodedata.normalize("NFC", "café")
239+
nfd = tmp_path / unicodedata.normalize("NFD", "café")
240+
nfc.mkdir(exist_ok=True)
241+
assert project_key_for_directory(str(nfc)) == project_key_for_directory(
242+
str(nfd)
243+
)
244+
231245
def test_long_path_uses_portable_djb2_suffix(self) -> None:
232246
"""Parity with TS: paths > MAX_SANITIZED_LENGTH get a djb2 hash
233247
suffix (runtime-portable so parent and subprocess derive the

0 commit comments

Comments
 (0)