Skip to content
This repository was archived by the owner on Mar 11, 2026. It is now read-only.

Commit 7a29acf

Browse files
committed
refactor: simplify wiki sync link segmentation
1 parent 4b0e254 commit 7a29acf

2 files changed

Lines changed: 105 additions & 78 deletions

File tree

scripts/sync_docs_to_wiki.py

Lines changed: 71 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,12 @@ class MarkdownLink:
8484
suffix: str
8585

8686

87+
@dataclass
88+
class Segment:
89+
kind: str
90+
text: str
91+
92+
8793
def repo_root() -> Path:
8894
return Path(__file__).resolve().parents[1]
8995

@@ -217,19 +223,6 @@ def parse_doc_target(target: str) -> tuple[str, str] | None:
217223
return base_target, anchor
218224

219225

220-
def find_candidates_by_suffix(
221-
language: str, suffix: str, source_pages: tuple[str, ...]
222-
) -> list[str]:
223-
prefix = f"{language}/"
224-
full_suffix = f"{language}/{suffix}"
225-
return [
226-
page
227-
for page in source_pages
228-
if page.startswith(prefix)
229-
and (page == full_suffix or page.endswith(f"/{suffix}"))
230-
]
231-
232-
233226
def find_existing_source_path(
234227
candidate: PurePosixPath,
235228
source_root: Path,
@@ -248,7 +241,14 @@ def find_existing_source_path(
248241
if not suffix:
249242
return ResolutionResult(resolved_path=None)
250243

251-
matches = find_candidates_by_suffix(language, suffix, source_pages)
244+
prefix = f"{language}/"
245+
full_suffix = f"{language}/{suffix}"
246+
matches = [
247+
page
248+
for page in source_pages
249+
if page.startswith(prefix)
250+
and (page == full_suffix or page.endswith(f"/{suffix}"))
251+
]
252252
if len(matches) == 1:
253253
return ResolutionResult(resolved_path=matches[0])
254254
if len(matches) > 1:
@@ -292,30 +292,30 @@ def __init__(self, source_root: Path):
292292
self.source_root = Path(source_root)
293293
self.source_pages = discover_source_pages(str(self.source_root))
294294

295-
def resolve(self, target: str, source_path: str) -> ResolutionResult:
296-
parsed_target = parse_doc_target(target)
297-
if parsed_target is None:
298-
return ResolutionResult(resolved_path=None)
299-
300-
base_target, _ = parsed_target
295+
def resolve_base_target(
296+
self, base_target: str, source_path: str
297+
) -> ResolutionResult:
301298
return resolve_link_path(
302299
base_target=base_target,
303300
source_path=source_path,
304301
source_root=self.source_root,
305302
source_pages=self.source_pages,
306303
)
307304

308-
def resolve_path(self, target: str, source_path: str) -> str | None:
309-
return self.resolve(target, source_path).resolved_path
305+
def resolve_markdown_target(
306+
self, target: str, source_path: str
307+
) -> tuple[str | None, str]:
308+
parsed_target = parse_doc_target(target)
309+
if parsed_target is None:
310+
return None, ""
310311

312+
base_target, anchor = parsed_target
313+
result = self.resolve_base_target(base_target, source_path)
314+
return result.resolved_path, anchor
311315

312-
def rewrite_link_target(target: str, source_path: str, resolver: LinkResolver) -> str:
313-
parsed_target = parse_doc_target(target)
314-
if parsed_target is None:
315-
return target
316316

317-
base_target, anchor = parsed_target
318-
resolved = resolver.resolve_path(base_target, source_path)
317+
def rewrite_link_target(target: str, source_path: str, resolver: LinkResolver) -> str:
318+
resolved, anchor = resolver.resolve_markdown_target(target, source_path)
319319
if resolved is None:
320320
return target
321321

@@ -343,55 +343,55 @@ def rewrite_links_in_segment(
343343
return "".join(result)
344344

345345

346-
def rewrite_links(
347-
content: str,
348-
source_path: str,
349-
resolver: LinkResolver,
350-
) -> str:
351-
parts: list[tuple[str, str]] = []
346+
def iter_segments(content: str):
352347
last_end = 0
353-
354-
for fenced_match in FENCED_BLOCK_RE.finditer(content):
355-
before = content[last_end : fenced_match.start()]
348+
for fenced in FENCED_BLOCK_RE.finditer(content):
349+
before = content[last_end : fenced.start()]
356350
if before:
357-
parts.append(("text", before))
358-
parts.append(("code", fenced_match.group(0)))
359-
last_end = fenced_match.end()
351+
last_inline_end = 0
352+
for inline in INLINE_CODE_RE.finditer(before):
353+
if inline.start() > last_inline_end:
354+
yield Segment("text", before[last_inline_end : inline.start()])
355+
yield Segment("inline_code", inline.group(0))
356+
last_inline_end = inline.end()
357+
if last_inline_end < len(before):
358+
yield Segment("text", before[last_inline_end:])
359+
360+
yield Segment("code_block", fenced.group(0))
361+
last_end = fenced.end()
360362

361363
tail = content[last_end:]
362-
if tail:
363-
parts.append(("text", tail))
364-
365-
output: list[str] = []
366-
for kind, chunk in parts:
367-
if kind == "code":
368-
output.append(chunk)
369-
continue
364+
if not tail:
365+
return
370366

371-
last_inline_end = 0
372-
for inline_match in INLINE_CODE_RE.finditer(chunk):
373-
before_inline = chunk[last_inline_end : inline_match.start()]
374-
if before_inline:
375-
output.append(
376-
rewrite_links_in_segment(
377-
before_inline,
378-
source_path=source_path,
379-
resolver=resolver,
380-
)
381-
)
367+
last_inline_end = 0
368+
for inline in INLINE_CODE_RE.finditer(tail):
369+
if inline.start() > last_inline_end:
370+
yield Segment("text", tail[last_inline_end : inline.start()])
371+
yield Segment("inline_code", inline.group(0))
372+
last_inline_end = inline.end()
373+
if last_inline_end < len(tail):
374+
yield Segment("text", tail[last_inline_end:])
382375

383-
output.append(inline_match.group(0))
384-
last_inline_end = inline_match.end()
385376

386-
after_inline = chunk[last_inline_end:]
387-
if after_inline:
377+
def rewrite_links(
378+
content: str,
379+
source_path: str,
380+
resolver: LinkResolver,
381+
) -> str:
382+
output: list[str] = []
383+
for segment in iter_segments(content):
384+
if segment.kind == "text":
388385
output.append(
389386
rewrite_links_in_segment(
390-
after_inline,
387+
segment.text,
391388
source_path=source_path,
392389
resolver=resolver,
393390
)
394391
)
392+
continue
393+
394+
output.append(segment.text)
395395

396396
return "".join(output)
397397

@@ -404,12 +404,16 @@ def find_unresolved_doc_links(source_root: Path) -> list[str]:
404404
for source_path in resolver.source_pages:
405405
content = (root / source_path).read_text(encoding="utf-8")
406406
for link in iter_markdown_links(content):
407+
resolved_path, _ = resolver.resolve_markdown_target(
408+
link.target, source_path
409+
)
410+
if resolved_path is not None:
411+
continue
407412
parsed_target = parse_doc_target(link.target)
408413
if parsed_target is None:
409414
continue
410-
resolution = resolver.resolve(link.target, source_path)
411-
if resolution.resolved_path is not None:
412-
continue
415+
base_target, _ = parsed_target
416+
resolution = resolver.resolve_base_target(base_target, source_path)
413417
if resolution.ambiguous_matches:
414418
unresolved.append(
415419
f"{source_path} -> {link.target} (ambiguous: {', '.join(resolution.ambiguous_matches)})",

tests/test_sync_docs_to_wiki.py

Lines changed: 34 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -41,14 +41,16 @@ def test_module_does_not_expose_removed_wrapper_helpers(self):
4141
self.assertFalse(hasattr(module, "resolve_source_path"))
4242
self.assertFalse(hasattr(module, "compute_managed_files"))
4343
self.assertFalse(hasattr(module, "MANAGED_FILENAMES"))
44+
self.assertFalse(hasattr(module, "find_candidates_by_suffix"))
4445

4546
def test_module_exposes_consolidated_helper_names(self):
4647
module = load_sync_module()
4748

4849
self.assertTrue(hasattr(module, "prepare_candidate_path"))
49-
self.assertTrue(hasattr(module, "find_candidates_by_suffix"))
5050
self.assertTrue(hasattr(module, "resolve_link_path"))
5151
self.assertTrue(hasattr(module, "LANG_CONFIG"))
52+
self.assertTrue(hasattr(module, "Segment"))
53+
self.assertTrue(hasattr(module, "iter_segments"))
5254

5355
def test_parse_doc_target_returns_base_and_anchor(self):
5456
module = load_sync_module()
@@ -68,6 +70,26 @@ def test_iter_markdown_links_handles_whitespace_before_target(self):
6870

6971
self.assertEqual([link.target for link in links], ["guide.md"])
7072

73+
def test_iter_segments_splits_text_inline_and_fenced_code(self):
74+
module = load_sync_module()
75+
76+
segments = list(
77+
module.iter_segments(
78+
"Start [Guide](/guide) `code [Guide](/guide)`\n\n```md\n[Guide](/guide)\n```\nTail\n"
79+
)
80+
)
81+
82+
self.assertEqual(
83+
[(segment.kind, segment.text) for segment in segments],
84+
[
85+
("text", "Start [Guide](/guide) "),
86+
("inline_code", "`code [Guide](/guide)`"),
87+
("text", "\n\n"),
88+
("code_block", "```md\n[Guide](/guide)\n```"),
89+
("text", "\nTail\n"),
90+
],
91+
)
92+
7193
def test_rewrite_links_handles_absolute_same_language_links(self):
7294
module = load_sync_module()
7395

@@ -221,8 +243,8 @@ def test_link_resolver_resolves_source_paths(self):
221243
resolver = module.LinkResolver(source_root)
222244

223245
self.assertEqual(
224-
resolver.resolve_path("/deploy/guide", "zh/index.md"),
225-
"zh/deploy/guide.md",
246+
resolver.resolve_markdown_target("/deploy/guide#intro", "zh/index.md"),
247+
("zh/deploy/guide.md", "#intro"),
226248
)
227249

228250
def test_resolve_link_path_resolves_relative_target(self):
@@ -271,21 +293,21 @@ def test_prepare_candidate_path_normalizes_suffix_and_alias(self):
271293
module.PurePosixPath("zh/providers/start.md"),
272294
)
273295

274-
def test_find_candidates_by_suffix_matches_language_bounded_suffixes(self):
296+
def test_find_existing_source_path_matches_language_bounded_suffixes(self):
275297
module = load_sync_module()
276298

277299
self.assertEqual(
278-
module.find_candidates_by_suffix(
279-
language="zh",
280-
suffix="bar/guide.md",
300+
module.find_existing_source_path(
301+
candidate=module.PurePosixPath("zh/bar/guide.md"),
302+
source_root=Path("/tmp/nonexistent"),
281303
source_pages=(
282304
"zh/bar/guide.md",
283305
"zh/foo/bar/guide.md",
284306
"zh/foobar/guide.md",
285307
"en/bar/guide.md",
286308
),
287-
),
288-
["zh/bar/guide.md", "zh/foo/bar/guide.md"],
309+
).ambiguous_matches,
310+
("zh/bar/guide.md", "zh/foo/bar/guide.md"),
289311
)
290312

291313
def test_build_page_info_returns_page_info_dataclass(self):
@@ -436,8 +458,9 @@ def test_resolver_does_not_match_partial_path_segments(self):
436458

437459
resolver = module.LinkResolver(source_root)
438460

439-
self.assertIsNone(
440-
resolver.resolve_path("/bar/guide", "zh/index.md"),
461+
self.assertEqual(
462+
resolver.resolve_markdown_target("/bar/guide", "zh/index.md"),
463+
(None, ""),
441464
)
442465

443466
def test_live_docs_have_no_unresolved_internal_doc_links(self):

0 commit comments

Comments
 (0)