Skip to content

Commit f9fab48

Browse files
cdeustclaude
andauthored
fix(wiki): plug slug/title leaks producing .md.md, timestamp-slugs, path-titles (#26)
* fix(wiki): plug slug/title leaks producing .md.md, timestamp-slugs, path-titles Audit of the methodology wiki (7,883 pages) on 2026-05-12 found: - 58 pages with .md.md double extension - 10 ADRs slugged "decision-created-2026-04-15t09-29-10z" from YAML frontmatter timestamps leaking through as titles - 11+ pages with embedded filesystem paths in slugs, e.g. "specs/2026-04-17-also-on-users-cdeust-documents-developments-..." - All polluted pages written 2026-04-21 — *after* v3.10.1 (2026-04-15) shipped the audit-artefact filter, so these are live bugs not history. Root causes and fixes: 1. .md.md (slugify preserves '.') — wiki_layout.slugify now strips a trailing chain of ".md" before returning. Single-point repair benefits all six filename builders (adr_filename, domain_page_path, wiki_sync, draft_compiler, ingest_prd, ingest_codebase_pages). 2. Timestamp-as-title (YAML "created:" line accepted by derive_title) — new _YAML_KV_TITLE_PATTERNS reject lines matching "(created|updated|date|...): <value>" and bare ISO-8601 timestamps. 3. Path-embedded-mid-sentence titles (existing _PATH_OR_URL_TITLE_PATTERNS only matched paths at line start) — added two patterns that match /Users/, /home/, /opt/, /var/, /etc/, /tmp/, /root/ anywhere in the line, plus Windows drive paths anywhere. 4. derive_title fallback leaked raw content[:80] when no clean line existed, defeating wiki_sync's deterministic memory-<hash> fallback. derive_title now returns "" when every candidate line is rejected (path, URL, YAML, JSON, too-short); callers route to the hash. Regression tests in test_wiki_layout.py (4 new) and test_wiki_classifier.py (6 new). 1720 core tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * style: ruff format test_wiki_classifier (CI fix) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 2f42428 commit f9fab48

4 files changed

Lines changed: 172 additions & 16 deletions

File tree

mcp_server/core/wiki_classifier.py

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,30 @@
169169
r"^\s*#*\s*[\w.-]+\.(pdf|png|jpg|jpeg|svg|gif|zip|tar\.gz|docx?|xlsx?|csv|log|yaml|yml)\b",
170170
re.IGNORECASE,
171171
),
172+
# Path embedded mid-line ("also on /Users/...", "fix the file at C:\\..."):
173+
# any absolute POSIX path or Windows drive path anywhere in the title.
174+
# Bug found 2026-05-12: pages like
175+
# specs/2026-04-17-also-on-users-cdeust-documents-developments-...md
176+
# passed the start-of-line check, then slugify stripped the leading "/"
177+
# and folded the entire path into the slug.
178+
re.compile(r"(?:^|\s)/(Users|home|root|opt|var|etc|tmp)/", re.IGNORECASE),
179+
re.compile(r"(?:^|\s)[A-Za-z]:[\\/]"),
180+
]
181+
182+
# YAML-frontmatter / key:value lines that leaked through as titles when the
183+
# real title was missing. Reject lines that are purely "key: value" where
184+
# the value is a timestamp, identifier, or boolean — they're metadata,
185+
# not titles. Bug found 2026-05-12: 10 ADRs slugged as
186+
# "decision-created-2026-04-15t09-29-10z" because the YAML "created:"
187+
# line was the first non-{}/[] line in the body.
188+
_YAML_KV_TITLE_PATTERNS = [
189+
# `created: 2026-04-15T09:29:10Z`, `updated: 2026-04-15`, `date: ...`
190+
re.compile(
191+
r"^\s*(created|updated|date|timestamp|time|id|uuid|version)\s*:\s*\S",
192+
re.IGNORECASE,
193+
),
194+
# Bare ISO-8601 timestamp anywhere (would slug to t09-29-10z form)
195+
re.compile(r"\b\d{4}-\d{2}-\d{2}T\d{2}[:-]\d{2}[:-]\d{2}", re.IGNORECASE),
172196
]
173197

174198
# Session / audit artefact tags — these are recall-fodder, not wiki-worthy.
@@ -538,6 +562,27 @@ def classify_memory(content: str, tags: list[str] | None = None) -> str | None:
538562
return "note"
539563

540564

565+
def _line_is_title_candidate(cleaned: str) -> bool:
566+
"""Return True iff ``cleaned`` is acceptable as a wiki page title.
567+
568+
Rejects: empty/short, JSON braces, embedded paths/URLs, YAML metadata
569+
key:value lines, bare timestamps. Callers that get False from every
570+
candidate line should yield an empty title and let the deterministic
571+
hash fallback kick in (see ``wiki_sync._sync_to_wiki``).
572+
"""
573+
if len(cleaned) <= 10:
574+
return False
575+
if cleaned.startswith("{") or cleaned.startswith("["):
576+
return False
577+
for pat in _PATH_OR_URL_TITLE_PATTERNS:
578+
if pat.search(cleaned):
579+
return False
580+
for pat in _YAML_KV_TITLE_PATTERNS:
581+
if pat.search(cleaned):
582+
return False
583+
return True
584+
585+
541586
def derive_title(
542587
content: str,
543588
kind: str,
@@ -548,31 +593,25 @@ def derive_title(
548593
549594
Strategy (Alexander P4 + Eco):
550595
1. Strip known prefixes
551-
2. Use kind-specific extraction
552-
3. Fall back to entity-based or tag-based title
553-
4. Last resort: first meaningful sentence
596+
2. Walk lines; accept the first that passes ``_line_is_title_candidate``
597+
3. Fall back to entity-based title if 2+ entities are supplied
598+
4. Otherwise return "" — caller is responsible for a deterministic
599+
fallback (e.g. ``memory-<hash>``). Returning a raw 80-char content
600+
prefix here used to leak filesystem paths, timestamps, and sentence
601+
fragments into slugs.
554602
"""
555-
# Get clean first line
556603
lines = content.strip().split("\n")
557604
first_meaningful = ""
558605
for line in lines:
559606
cleaned = line.strip()
560607
for pat in _TITLE_STRIP_PREFIXES:
561608
cleaned = pat.sub("", cleaned).strip()
562-
if (
563-
len(cleaned) > 10
564-
and not cleaned.startswith("{")
565-
and not cleaned.startswith("[")
566-
):
609+
if _line_is_title_candidate(cleaned):
567610
first_meaningful = cleaned
568611
break
569612

570-
if not first_meaningful:
571-
first_meaningful = content.strip()[:80]
572-
573613
# Truncate to reasonable title length
574614
if len(first_meaningful) > 80:
575-
# Cut at word boundary
576615
first_meaningful = first_meaningful[:77].rsplit(" ", 1)[0] + "..."
577616

578617
# Kind-specific prefixing for clarity
@@ -591,6 +630,9 @@ def derive_title(
591630
return f"{prefix}: {entity_title}"
592631
return entity_title
593632

633+
if not first_meaningful:
634+
return ""
635+
594636
if prefix and not first_meaningful.lower().startswith(prefix.lower()):
595637
return f"{prefix}: {first_meaningful}"
596638

mcp_server/core/wiki_layout.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,29 @@
3232

3333
_SAFE = re.compile(r"[^a-zA-Z0-9_.-]+")
3434
_MAX_SLUG_LEN = 80
35+
# Trailing extension tokens to strip before slug consumers append ".md".
36+
# Without this strip, an input title that already looks like a filename
37+
# (e.g. "001-zero-dependencies.md") produced filenames like
38+
# "2234-001-zero-dependencies.md.md" because every caller in the wiki
39+
# layer appends ".md" unconditionally. Strip iteratively so e.g.
40+
# "foo.md.md" → "foo".
41+
_TRAILING_MD_EXT = re.compile(r"(?:\.md)+$", re.IGNORECASE)
3542

3643

3744
def slugify(value: str, *, max_len: int = _MAX_SLUG_LEN) -> str:
38-
"""Stable filesystem-safe slug. Deterministic, lowercased, length-capped."""
45+
"""Stable filesystem-safe slug. Deterministic, lowercased, length-capped.
46+
47+
Postcondition: the returned slug never ends with ``.md`` (or any chain
48+
of ``.md`` suffixes). Callers may safely append ``.md`` without
49+
risking ``.md.md``.
50+
"""
3951
if not value:
4052
return "unknown"
4153
cleaned = _SAFE.sub("-", value.strip().lower()).strip("-")
4254
if not cleaned:
4355
return "unknown"
44-
return cleaned[:max_len].rstrip("-") or "unknown"
56+
cleaned = _TRAILING_MD_EXT.sub("", cleaned).rstrip(".-") or "unknown"
57+
return cleaned[:max_len].rstrip("-.") or "unknown"
4558

4659

4760
def file_path_slug(file_path: str) -> str:

tests_py/core/test_wiki_classifier.py

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from __future__ import annotations
44

5-
from mcp_server.core.wiki_classifier import classify_memory
5+
from mcp_server.core.wiki_classifier import classify_memory, derive_title
66

77

88
# ── Audit-tag gate ────────────────────────────────────────────────────
@@ -97,3 +97,76 @@ def test_valid_lesson_admitted() -> None:
9797
"on path again."
9898
)
9999
assert classify_memory(content, tags=["lesson", "bug-fix"]) == "lesson"
100+
101+
102+
# ── derive_title regression tests (bugs found 2026-05-12) ─────────────
103+
104+
105+
def test_derive_title_rejects_yaml_timestamp_line() -> None:
106+
"""Regression — 10 ADRs were slugged ``decision-created-2026-04-15t...``
107+
because the first non-{}/[] line in the body was a YAML ``created:``
108+
timestamp. derive_title must reject metadata key:value lines.
109+
"""
110+
content = (
111+
"created: 2026-04-15T09:29:10Z\n"
112+
"We adopted pgvector with HNSW because benchmarks showed 3x speedup.\n"
113+
)
114+
title = derive_title(content, "adr")
115+
assert "2026-04-15" not in title
116+
assert "created" not in title.lower()
117+
assert "pgvector" in title.lower() or "HNSW" in title
118+
119+
120+
def test_derive_title_rejects_embedded_posix_path() -> None:
121+
"""Regression — pages like ``2026-04-17-also-on-users-cdeust-documents-
122+
developments-...`` were created because the first body line contained
123+
an absolute /Users/ path mid-sentence and the path-as-title regex only
124+
matched start-of-line. derive_title must reject path-embedded lines.
125+
"""
126+
content = (
127+
"also on /Users/cdeust/Documents/Developments/ai-architect-prd-builder "
128+
"the same issue shows up\n"
129+
"The real spec: caching keys must include model SHA.\n"
130+
)
131+
title = derive_title(content, "spec")
132+
assert "users-cdeust" not in title.lower()
133+
assert "/users/" not in title.lower()
134+
135+
136+
def test_derive_title_rejects_windows_path() -> None:
137+
content = (
138+
"see C:\\Users\\dev\\project\\config.toml for details\n"
139+
"Caching policy: include the model SHA in every key.\n"
140+
)
141+
title = derive_title(content, "spec")
142+
# The C:\ line must not appear; the clean second line is used instead.
143+
assert "config.toml" not in title.lower()
144+
assert "\\users\\dev" not in title.lower()
145+
assert "caching" in title.lower()
146+
147+
148+
def test_derive_title_returns_empty_when_no_clean_candidate() -> None:
149+
"""When every candidate line is rejected, return empty so the caller
150+
(wiki_sync) routes to the deterministic ``memory-<hash>`` fallback
151+
instead of inheriting the v3.10.1 ``content[:80]`` raw-fragment leak.
152+
"""
153+
content = "created: 2026-04-15T09:29:10Z\nupdated: 2026-04-15T09:29:11Z\nid: 1828\n"
154+
title = derive_title(content, "adr")
155+
assert title == ""
156+
157+
158+
def test_derive_title_bare_iso_timestamp_rejected() -> None:
159+
content = (
160+
"2026-04-15T09:29:10Z is when this happened\n"
161+
"Decision: adopt pgvector for ANN.\n"
162+
)
163+
title = derive_title(content, "adr")
164+
assert "2026-04-15" not in title
165+
166+
167+
def test_derive_title_still_works_for_clean_input() -> None:
168+
"""Positive control — happy path must continue to work."""
169+
content = "Use pgvector for retrieval — IVFFlat is too slow at our scale."
170+
title = derive_title(content, "adr")
171+
assert "pgvector" in title.lower()
172+
assert title.startswith("Decision:")

tests_py/core/test_wiki_layout.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,34 @@ def test_slugify_empty() -> None:
3030
assert slugify("!!!") == "unknown"
3131

3232

33+
def test_slugify_strips_trailing_md_extension() -> None:
34+
"""Regression — bug found 2026-05-12.
35+
36+
Inputs that already look like .md filenames produced .md.md pages
37+
because every wiki callsite appends ``.md`` to the slug. 58 pages
38+
in the wiki had this shape (e.g. ``2234-decision-001-zero-
39+
dependencies.md.md``). Slug must never end in ``.md``.
40+
"""
41+
assert slugify("001-zero-dependencies.md") == "001-zero-dependencies"
42+
assert slugify("foo.md") == "foo"
43+
# Iterated md chain — should collapse to base.
44+
assert slugify("foo.md.md") == "foo"
45+
assert slugify("foo.md.md.md") == "foo"
46+
47+
48+
def test_slugify_preserves_non_md_extensions() -> None:
49+
"""``file_path_slug`` callers depend on .py/.ts/etc. surviving."""
50+
assert slugify("login.py") == "login.py"
51+
assert slugify("config.yaml") == "config.yaml"
52+
53+
54+
def test_adr_filename_no_double_md() -> None:
55+
"""End-to-end: title that contains '.md' must not yield .md.md."""
56+
slug = slugify("001-zero-dependencies.md")
57+
assert adr_filename(2234, slug) == "2234-001-zero-dependencies.md"
58+
assert not adr_filename(2234, slug).endswith(".md.md")
59+
60+
3361
def test_file_path_slug() -> None:
3462
assert (
3563
file_path_slug("mcp_server/handlers/wiki_write.py")

0 commit comments

Comments
 (0)