fix(tokenizer): guard the tiktoken cache with a threading.Lock

Brad Kinnard · Brad Kinnard · commit 8f7a0012e093 · 2026-05-27T08:11:44.000-06:00
The lazy-init code read and wrote two module-level globals without
synchronization. Two threads racing on first init could both observe
the untested state and both call tiktoken.get_encoding, then write
different encoding objects to the cache. The double-checked-locking
pattern keeps the fast path lock-free after init while making the
first-init transition atomic. The new concurrent-first-init test runs
eight workers through _get_tiktoken_enc and asserts they all see the
same cached object.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- `tokenizer._get_tiktoken_enc` is now thread-safe. The first-init slow path takes a module-level lock and re-checks the cached state under it, so concurrent callers (e.g. editor plugins running in a worker pool) cannot race on `tiktoken.get_encoding`; once initialized, the fast path stays lock-free.
 - `_is_action_verb` now recognizes `-ing` and `-ed` inflections, including the e-drop ("validating" -> "validate") and doubled-consonant ("scanning" -> "scan") forms. Descriptions like "Validating skills..." or "Used for..." used to score 0 on the action axis even though the leading word was a clear action verb.
 - `--format` error message now lists `github` as a valid choice (`cli.py`). The argparse `choices=` set on the `--format` definition has accepted `github` since v1.2.3, but the post-parse runtime check still printed the pre-v1.2.3 four-value list.
 
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@
 
 Static analyzer for `SKILL.md` files. Validates frontmatter, body sizing, file references, and cross-agent compatibility against the [agentskills.io specification](https://agentskills.io/specification). No network calls. No LLM API calls. No file mutations.
 
-785 tests cover all rule modules.
+786 tests cover all rule modules.
 
 ## Install
 
diff --git a/src/skillcheck/tokenizer.py b/src/skillcheck/tokenizer.py
@@ -1,4 +1,5 @@
 import re
+import threading
 from typing import Any
 
 # Two patterns that cover the token-relevant structure of BPE tokenization:
@@ -18,22 +19,30 @@
 
 # Lazy-cached tiktoken encoding.  The BPE merge table is allocated once on
 # first use and reused for all subsequent calls, avoiding the per-call
-# overhead of ``tiktoken.get_encoding()``.
+# overhead of ``tiktoken.get_encoding()``.  The lock guards the first-init
+# slow path so concurrent worker threads (editor plugins planned for
+# future use) cannot both observe the untested state and race on
+# ``tiktoken.get_encoding``; once initialized, reads skip the lock.
 _tiktoken_enc: Any | None = None
 _tiktoken_available: bool | None = None  # tri-state: None = untested
+_tiktoken_lock = threading.Lock()
 
 
 def _get_tiktoken_enc() -> Any | None:
     """Return a cached tiktoken ``Encoding``, or *None* if unavailable."""
     global _tiktoken_enc, _tiktoken_available  # noqa: PLW0603
-    if _tiktoken_available is None:
-        try:
-            import tiktoken  # type: ignore[import-untyped]
-            _tiktoken_enc = tiktoken.get_encoding("cl100k_base")
-            _tiktoken_available = True
-        except ImportError:
-            _tiktoken_available = False
-    return _tiktoken_enc
+    if _tiktoken_available is not None:
+        return _tiktoken_enc
+    with _tiktoken_lock:
+        # Re-check under the lock so the second arrival sees the cached value.
+        if _tiktoken_available is None:
+            try:
+                import tiktoken  # type: ignore[import-untyped]
+                _tiktoken_enc = tiktoken.get_encoding("cl100k_base")
+                _tiktoken_available = True
+            except ImportError:
+                _tiktoken_available = False
+        return _tiktoken_enc
 
 
 def estimate_tokens(text: str) -> int:
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
@@ -1,5 +1,7 @@
 """Tests for Fix 1: Tokenizer lazy caching."""
 
+from concurrent.futures import ThreadPoolExecutor
+
 from skillcheck.tokenizer import _get_tiktoken_enc, estimate_tokens
 
 
@@ -30,3 +32,18 @@ def test_tiktoken_enc_cached():
 def test_empty_string_returns_one():
     """Empty string returns at least 1 (the floor)."""
     assert estimate_tokens("") >= 1
+
+
+def test_tokenizer_concurrent_first_init_is_consistent():
+    """Concurrent first-init calls return the same cached encoding instance.
+
+    The first caller pays the tiktoken.get_encoding cost; later callers
+    get the same cached object (or all None without tiktoken).  NOTE: the
+    cache is not reset here, so an earlier test may have warmed it already.
+    """
+    with ThreadPoolExecutor(max_workers=8) as pool:
+        results = list(pool.map(lambda _: _get_tiktoken_enc(), range(8)))
+    distinct = {id(r) for r in results}
+    assert len(distinct) == 1, (
+        f"Concurrent first-init produced different encoding objects: {distinct}"
+    )