Skip to content

Commit e789a3d

Browse files
authored
Stabilize section link resolution determinism (#8)
* CI: allow partial publish and list skipped specs
* Make section-link repair deterministic and add regression tests
* Remove dead private helpers and consolidate cleanup wrappers
* Use shared helper in Repair-MissingSectionAnchors
* Deduplicate download retry logic
* Reuse Invoke-OpenSpecRequest for download retries
* Make DOCX fallback order-independent in parallel
* Deduplicate per-file logic in Repair-AllBrokenLinks
* Fix parallel download helper visibility
* Drop PDF source conversion path
* Stabilize section link resolution determinism
1 parent bb6d252 commit e789a3d

20 files changed

Lines changed: 600 additions & 341 deletions

AGENTS.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,5 +60,5 @@ When you add a new exported function, add its name to the `$expected` array in t
6060
## Project-specific rules
6161

6262
- Do not remove or rename exported functions without updating `AwakeCoding.OpenSpecs.psd1` and the exports test.
63-
- Conversion: DOCX is handled in-module via OpenXML; PDF uses external `docling` or `markitdown` when available (see `AwakeCoding.OpenSpecs/Private/Get-OpenSpecToolchain.ps1`). Output is textual (tables, ASCII), not image-based.
63+
- Conversion: DOCX is handled in-module via OpenXML. PDF is not used as a conversion source. Output is textual (tables, ASCII), not image-based.
6464
- For bulk or CI conversions, use `-Parallel -ThrottleLimit N` with `Convert-OpenSpecToMarkdown` or `Invoke-OpenSpecConversionPipeline` (PowerShell 7 only).

AwakeCoding.OpenSpecs/Private/ConvertFrom-OpenSpecDocx.ps1

Lines changed: 151 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -263,18 +263,84 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {
263263
}
264264
}
265265
}
266-
$titleToSection = @{}
266+
$getSectionSortKey = {
267+
param([string]$sectionId)
268+
269+
if ($sectionId -match '^Section_(?<num>\d+(?:\.\d+)*)$') {
270+
$parts = @($Matches['num'] -split '\.' | ForEach-Object {
271+
if ($_ -match '^\d+$') { [int]$_ } else { 0 }
272+
})
273+
$padded = @($parts | ForEach-Object { '{0:D8}' -f $_ })
274+
return ('0|' + ($padded -join '.'))
275+
}
276+
277+
return ('1|' + $sectionId.ToLowerInvariant())
278+
}
279+
280+
$sectionEntries = New-Object System.Collections.Generic.List[object]
267281
foreach ($entry in $linkMetadata.SectionToTitle.GetEnumerator()) {
268-
$key = [string]$entry.Key
269-
$val = ([string]$entry.Value -replace '\s+', ' ').Trim()
270-
if (-not [string]::IsNullOrWhiteSpace($val)) {
271-
$titleToSection[$val] = $key
272-
$withoutNum = ($val -replace '^\d+(?:\.\d+)*\s+', '').Trim()
273-
if ($withoutNum -and -not $titleToSection.ContainsKey($withoutNum)) {
274-
$titleToSection[$withoutNum] = $key
282+
$sectionId = [string]$entry.Key
283+
$title = ([string]$entry.Value -replace '\s+', ' ').Trim()
284+
if ([string]::IsNullOrWhiteSpace($sectionId) -or [string]::IsNullOrWhiteSpace($title)) {
285+
continue
286+
}
287+
288+
$titleWithoutNum = ($title -replace '^\d+(?:\.\d+)*\s+', '').Trim()
289+
[void]$sectionEntries.Add([pscustomobject]@{
290+
SectionId = $sectionId
291+
TitleNormalized = $title
292+
TitleWithoutNumber = $titleWithoutNum
293+
SortKey = (& $getSectionSortKey $sectionId)
294+
})
295+
}
296+
297+
$orderedSectionEntries = @($sectionEntries | Sort-Object -Property @(
298+
@{ Expression = { $_.SortKey } },
299+
@{ Expression = { $_.SectionId.ToLowerInvariant() } },
300+
@{ Expression = { $_.TitleNormalized.ToLowerInvariant() } }
301+
))
302+
303+
$sectionIdSet = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
304+
$titleToSections = @{}
305+
$titleWithoutNumToSections = @{}
306+
307+
foreach ($entry in $orderedSectionEntries) {
308+
[void]$sectionIdSet.Add($entry.SectionId)
309+
310+
$titleKey = $entry.TitleNormalized.ToLowerInvariant()
311+
if (-not $titleToSections.ContainsKey($titleKey)) {
312+
$titleToSections[$titleKey] = New-Object System.Collections.Generic.List[string]
313+
}
314+
if (-not $titleToSections[$titleKey].Contains($entry.SectionId)) {
315+
[void]$titleToSections[$titleKey].Add($entry.SectionId)
316+
}
317+
318+
if (-not [string]::IsNullOrWhiteSpace($entry.TitleWithoutNumber)) {
319+
$withoutNumKey = $entry.TitleWithoutNumber.ToLowerInvariant()
320+
if (-not $titleWithoutNumToSections.ContainsKey($withoutNumKey)) {
321+
$titleWithoutNumToSections[$withoutNumKey] = New-Object System.Collections.Generic.List[string]
275322
}
323+
if (-not $titleWithoutNumToSections[$withoutNumKey].Contains($entry.SectionId)) {
324+
[void]$titleWithoutNumToSections[$withoutNumKey].Add($entry.SectionId)
325+
}
326+
}
327+
}
328+
329+
$findUniqueSection = {
330+
param([System.Collections.Generic.List[string]]$candidateSections)
331+
332+
if ($null -eq $candidateSections) {
333+
return $null
334+
}
335+
336+
$candidates = @($candidateSections | Sort-Object)
337+
if ($candidates.Count -eq 1) {
338+
return $candidates[0]
276339
}
340+
341+
return $null
277342
}
343+
278344
$sectionGuidRegex = [regex]::new('^(?:[Ss]ection_)?([a-f0-9]{32})$')
279345
$internalLinksArray = $linkMetadata.InternalHyperlinks.ToArray()
280346
foreach ($link in $internalLinksArray) {
@@ -285,18 +351,86 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {
285351
$guid = $m.Groups[1].Value.ToLowerInvariant()
286352
if ($linkMetadata.GuidToSection.ContainsKey($guid)) { continue }
287353
$matchedSection = $null
288-
if ($titleToSection.ContainsKey($text)) {
289-
$matchedSection = $titleToSection[$text]
354+
355+
if ($text -match '^(?:section\s+)?(?<num>\d+(?:\.\d+)*)$') {
356+
$directSection = "Section_$($Matches['num'])"
357+
if ($sectionIdSet.Contains($directSection)) {
358+
$matchedSection = $directSection
359+
}
290360
}
291-
else {
292-
foreach ($tit in $titleToSection.Keys) {
293-
if ($tit -eq $text) { $matchedSection = $titleToSection[$tit]; break }
294-
$textEsc = [Management.Automation.WildcardPattern]::Escape($text)
295-
$titEsc = [Management.Automation.WildcardPattern]::Escape($tit)
296-
if ($tit -like "*$textEsc*" -and $text.Length -ge 8) { $matchedSection = $titleToSection[$tit]; break }
297-
if ($text -like "*$titEsc*" -and $tit.Length -ge 8) { $matchedSection = $titleToSection[$tit]; break }
361+
362+
$textKey = $text.ToLowerInvariant()
363+
if (-not $matchedSection -and $titleToSections.ContainsKey($textKey)) {
364+
$matchedSection = & $findUniqueSection $titleToSections[$textKey]
365+
}
366+
367+
$textWithoutNum = ($text -replace '^\d+(?:\.\d+)*\s+', '').Trim()
368+
if (-not $matchedSection -and -not [string]::IsNullOrWhiteSpace($textWithoutNum)) {
369+
$textWithoutNumKey = $textWithoutNum.ToLowerInvariant()
370+
if ($titleWithoutNumToSections.ContainsKey($textWithoutNumKey)) {
371+
$matchedSection = & $findUniqueSection $titleWithoutNumToSections[$textWithoutNumKey]
298372
}
299373
}
374+
375+
if (-not $matchedSection -and $text.Length -ge 8) {
376+
$fuzzyCandidates = New-Object System.Collections.Generic.List[object]
377+
foreach ($entry in $orderedSectionEntries) {
378+
$candidateTitle = $entry.TitleNormalized
379+
if ([string]::IsNullOrWhiteSpace($candidateTitle) -or $candidateTitle.Length -lt 8) {
380+
continue
381+
}
382+
383+
$containsText = $candidateTitle.IndexOf($text, [System.StringComparison]::OrdinalIgnoreCase) -ge 0
384+
$containsCandidate = $text.IndexOf($candidateTitle, [System.StringComparison]::OrdinalIgnoreCase) -ge 0
385+
if (-not $containsText -and -not $containsCandidate) {
386+
continue
387+
}
388+
389+
$score = if ($containsText -and $containsCandidate) {
390+
0
391+
}
392+
elseif ($containsText) {
393+
1
394+
}
395+
else {
396+
2
397+
}
398+
399+
[void]$fuzzyCandidates.Add([pscustomobject]@{
400+
Score = $score
401+
LengthDelta = [Math]::Abs($candidateTitle.Length - $text.Length)
402+
SortKey = $entry.SortKey
403+
SectionId = $entry.SectionId
404+
})
405+
}
406+
407+
if ($fuzzyCandidates.Count -gt 0) {
408+
$orderedCandidates = @($fuzzyCandidates | Sort-Object -Property @(
409+
@{ Expression = { $_.Score } },
410+
@{ Expression = { $_.LengthDelta } },
411+
@{ Expression = { $_.SortKey } },
412+
@{ Expression = { $_.SectionId } }
413+
))
414+
415+
$best = $orderedCandidates[0]
416+
$isUniqueBest = $true
417+
if ($orderedCandidates.Count -gt 1) {
418+
$second = $orderedCandidates[1]
419+
if (
420+
$second.Score -eq $best.Score -and
421+
$second.LengthDelta -eq $best.LengthDelta -and
422+
$second.SortKey -eq $best.SortKey
423+
) {
424+
$isUniqueBest = $false
425+
}
426+
}
427+
428+
if ($isUniqueBest) {
429+
$matchedSection = $best.SectionId
430+
}
431+
}
432+
}
433+
300434
if ($matchedSection) {
301435
$linkMetadata.GuidToSection[$guid] = $matchedSection
302436
}
@@ -912,26 +1046,6 @@ function Get-OpenSpecOpenXmlParagraphAnchorInfo {
9121046
}
9131047
}
9141048

915-
function Get-OpenSpecOpenXmlParagraphAnchors {
916-
[CmdletBinding()]
917-
param(
918-
[Parameter(Mandatory)]
919-
[System.Xml.XmlNode]$ParagraphNode,
920-
921-
[Parameter(Mandatory)]
922-
[System.Xml.XmlNamespaceManager]$NamespaceManager,
923-
924-
[Parameter()]
925-
[string]$ParagraphText,
926-
927-
[Parameter()]
928-
[string]$HeadingStyle
929-
)
930-
931-
$info = Get-OpenSpecOpenXmlParagraphAnchorInfo -ParagraphNode $ParagraphNode -NamespaceManager $NamespaceManager -ParagraphText $ParagraphText -HeadingStyle $HeadingStyle
932-
return @($info.Anchors)
933-
}
934-
9351049
function Get-OpenSpecOpenXmlParagraphInternalHyperlinks {
9361050
[CmdletBinding()]
9371051
param(

AwakeCoding.OpenSpecs/Private/ConvertFrom-OpenSpecPdf.ps1

Lines changed: 0 additions & 56 deletions
This file was deleted.

AwakeCoding.OpenSpecs/Private/Get-OpenSpecGuidSectionMapFromLearn.ps1

Lines changed: 0 additions & 69 deletions
This file was deleted.

AwakeCoding.OpenSpecs/Private/Get-OpenSpecToolchain.ps1

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
function Get-OpenSpecToolchain {
22
[CmdletBinding()]
33
param(
4-
[switch]$RequirePdfConverter,
54
[switch]$RequireDocxConverter
65
)
76

@@ -22,10 +21,6 @@ function Get-OpenSpecToolchain {
2221
HasOpenXml = $null -ne $openXmlModule
2322
}
2423

25-
if ($RequirePdfConverter -and -not ($toolchain.HasDocling -or $toolchain.HasMarkItDown)) {
26-
throw 'No PDF converter detected. Install docling or markitdown.'
27-
}
28-
2924
if ($RequireDocxConverter -and -not $toolchain.HasOpenXml) {
3025
throw 'No DOCX converter detected. Install the OpenXML PowerShell module.'
3126
}

0 commit comments

Comments
 (0)