Skip to content

Commit a38141e

Browse files
Refs/heads/more cleanup (#3)
* Build script, fix TOC, add live HTML comparison

  - Add scripts/Build-Publish.ps1 for local build (mirrors workflow)
  - Fix convert-and-publish.yml: use protocol-named .md, copy as index.md
  - Update-OpenSpecIndex: use catalog titles, improve markdown fallback
  - Get-OpenSpecCatalog: parse Description column for richer TOC
  - Add Compare-OpenSpecToLiveHtml for conversion vs live HTML review

  Co-authored-by: Cursor <cursoragent@cursor.com>

* Add conversion quality analysis, fix Compare-OpenSpecToLiveHtml

  - Add scripts/Analyze-ConversionQuality.ps1 for conversion + live HTML review
  - Fix Compare-OpenSpecToLiveHtml: null-safe path, liveHeadings index, per-protocol try-catch
  - Add CONVERSION-QUALITY-ANALYSIS.md with full analysis results

  Co-authored-by: Cursor <cursoragent@cursor.com>

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 3161ceb commit a38141e

8 files changed

Lines changed: 492 additions & 22 deletions

.github/workflows/convert-and-publish.yml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,11 +43,12 @@ jobs:
4343
New-Item -Path $publish -ItemType Directory -Force | Out-Null
4444
Get-ChildItem -LiteralPath $converted -Directory | ForEach-Object {
4545
$name = $_.Name
46-
$md = Join-Path $_.FullName 'index.md'
46+
$md = Join-Path $_.FullName "$name.md"
47+
if (-not (Test-Path -LiteralPath $md)) { $md = Join-Path $_.FullName 'index.md' }
4748
if (-not (Test-Path -LiteralPath $md)) { return }
4849
$dest = Join-Path $publish $name
4950
New-Item -Path $dest -ItemType Directory -Force | Out-Null
50-
Copy-Item -LiteralPath $md -Destination $dest -Force
51+
Copy-Item -LiteralPath $md -Destination (Join-Path $dest 'index.md') -Force
5152
$media = Join-Path $_.FullName 'media'
5253
if (Test-Path -LiteralPath $media -PathType Container) {
5354
Copy-Item -LiteralPath $media -Destination $dest -Recurse -Force

AwakeCoding.OpenSpecs/AwakeCoding.OpenSpecs.psd1

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
'Invoke-OpenSpecConversionPipeline',
2020
'Get-OpenSpecConversionReport',
2121
'Test-OpenSpecMarkdownFidelity',
22-
'Update-OpenSpecIndex'
22+
'Update-OpenSpecIndex',
23+
'Compare-OpenSpecToLiveHtml'
2324
)
2425
CmdletsToExport = @()
2526
VariablesToExport = @()
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
function Compare-OpenSpecToLiveHtml {
    <#
    .SYNOPSIS
        Compares converted markdown structure to the live HTML spec page on learn.microsoft.com.
    .DESCRIPTION
        For each conversion report returned by Get-OpenSpecConversionReport, reads the
        converted markdown, collects its headings and <a id="..."></a> anchors, then
        fetches the live spec page (URL taken from the Get-OpenSpecCatalog entry) and
        collects its <h1>-<h6> headings and their id attributes. Anchors present on one
        side but not the other are reported, and a manual-review flag is raised when the
        fetch failed, when more than five anchors are missing on either side, or when
        the live page yielded no headings while the markdown has some.
    .PARAMETER OutputPath
        Directory containing the converted specs. Defaults to 'converted-specs' under
        the current location. Must exist.
    .PARAMETER ProtocolId
        Optional protocol IDs, passed through unchanged to Get-OpenSpecConversionReport.
    .OUTPUTS
        One AwakeCoding.OpenSpecs.LiveHtmlCompareResult object per processed report.
        Success and failure results expose the same set of properties.
    #>
    [CmdletBinding()]
    param(
        [string]$OutputPath = (Join-Path -Path (Get-Location) -ChildPath 'converted-specs'),

        [string[]]$ProtocolId
    )

    if (-not (Test-Path -LiteralPath $OutputPath)) {
        throw "Output path '$OutputPath' was not found."
    }

    # Catalog lookup keyed by protocol ID, plus an underscore-normalized alias.
    # A catalog failure is non-fatal: every spec is then reported as "not found in catalog".
    $catalog = @{}
    try {
        foreach ($e in (Get-OpenSpecCatalog)) {
            $catalog[$e.ProtocolId] = $e
            $norm = $e.ProtocolId -replace '-', '_'
            if (-not $catalog[$norm]) { $catalog[$norm] = $e }
        }
    }
    catch {
        Write-Warning "Could not fetch catalog: $($_.Exception.Message)"
    }

    $reports = Get-OpenSpecConversionReport -OutputPath $OutputPath -ProtocolId $ProtocolId
    $results = New-Object System.Collections.Generic.List[object]

    # Loop-invariant patterns, built once instead of once per spec.
    $mdHeadingRegex = [regex]::new('(?m)^(#{1,6})\s+(.+)$')
    $mdAnchorRegex = [regex]::new('<a\s+id="([^"]+)"\s*>\s*</a>')
    # Single pass over all headings: group 2 holds the raw attribute text, from which
    # the optional id is extracted. This keeps document order and avoids counting
    # headings that carry an id twice.
    $liveHeadingRegex = [regex]::new('(?is)<h([1-6])([^>]*)>([^<]*)</h\1>')
    $idAttrRegex = [regex]::new('(?is)\sid="([^"]+)"')

    foreach ($report in $reports) {
        try {
            # NOTE: deliberately NOT named $protocolId. PowerShell variable names are
            # case-insensitive, so that name is the [string[]]$ProtocolId parameter and
            # every assignment would be coerced to a one-element string array.
            $specId = [string]$report.ProtocolId
            if ([string]::IsNullOrWhiteSpace($specId)) { continue }

            $mdPath = $report.MarkdownPath
            if ([string]::IsNullOrWhiteSpace($mdPath)) {
                $mdPath = Join-Path (Join-Path $OutputPath $specId) "$specId.md"
            }

            $markdown = ''
            if ($mdPath -and (Test-Path -LiteralPath $mdPath -PathType Leaf -ErrorAction SilentlyContinue)) {
                # Get-Content -Raw yields $null for an empty/unreadable file; normalize to ''.
                $markdown = [string](Get-Content -LiteralPath $mdPath -Raw -ErrorAction SilentlyContinue)
            }

            # --- Markdown side ---
            $mdHeadings = New-Object System.Collections.Generic.List[object]
            $mdAnchors = New-Object System.Collections.Generic.HashSet[string]([System.StringComparer]::OrdinalIgnoreCase)
            foreach ($m in $mdHeadingRegex.Matches($markdown)) {
                [void]$mdHeadings.Add([pscustomobject]@{
                    Level = $m.Groups[1].Value.Length
                    Text  = $m.Groups[2].Value.Trim()
                })
            }
            foreach ($m in $mdAnchorRegex.Matches($markdown)) {
                [void]$mdAnchors.Add($m.Groups[1].Value)
            }

            # --- Live HTML side ---
            $liveHeadings = New-Object System.Collections.Generic.List[object]
            $liveAnchors = New-Object System.Collections.Generic.HashSet[string]([System.StringComparer]::OrdinalIgnoreCase)
            $liveUrl = $null
            $fetchError = $null

            $entry = $catalog[$specId]
            if ($entry) {
                $liveUrl = $entry.SpecPageUrl
                try {
                    $response = Invoke-OpenSpecRequest -Uri $liveUrl
                    $html = [string]$response.Content
                    foreach ($m in $liveHeadingRegex.Matches($html)) {
                        $level = [int]$m.Groups[1].Value

                        # Skip the HTML-to-text call for blank heading bodies.
                        $rawText = $m.Groups[3].Value
                        $text = ''
                        if (-not [string]::IsNullOrWhiteSpace($rawText)) {
                            $text = ([string](ConvertFrom-OpenSpecHtml -Html $rawText)).Trim()
                        }

                        $id = $null
                        $idMatch = $idAttrRegex.Match($m.Groups[2].Value)
                        if ($idMatch.Success) { $id = $idMatch.Groups[1].Value }

                        if ($id) {
                            # Headings with an id are always recorded, even with empty text.
                            [void]$liveHeadings.Add([pscustomobject]@{ Level = $level; Id = $id; Text = $text })
                            [void]$liveAnchors.Add($id)
                        }
                        elseif ($text.Length -gt 0) {
                            [void]$liveHeadings.Add([pscustomobject]@{ Level = $level; Id = $null; Text = $text })
                        }
                    }
                }
                catch {
                    $fetchError = $_.Exception.Message
                }
            }
            else {
                $fetchError = "Protocol not found in catalog"
            }

            # --- Anchor comparison ---
            $missingInMd = New-Object System.Collections.Generic.List[string]
            $missingInLive = New-Object System.Collections.Generic.List[string]
            foreach ($aid in @($mdAnchors)) {
                # _Toc<digits> anchors are ignored here, and nothing is reported as
                # missing from the live page when no live anchors were collected.
                if ($aid -notmatch '^_Toc\d+$' -and $liveAnchors.Count -gt 0 -and -not $liveAnchors.Contains($aid)) {
                    [void]$missingInLive.Add($aid)
                }
            }
            foreach ($aid in @($liveAnchors)) {
                if (-not $mdAnchors.Contains($aid)) {
                    [void]$missingInMd.Add($aid)
                }
            }

            $suggestReview = $false
            if ($fetchError) { $suggestReview = $true }
            if ($missingInMd.Count -gt 5 -or $missingInLive.Count -gt 5) { $suggestReview = $true }
            if ($liveHeadings.Count -eq 0 -and $mdHeadings.Count -gt 0) { $suggestReview = $true }

            [void]$results.Add([pscustomobject]@{
                PSTypeName           = 'AwakeCoding.OpenSpecs.LiveHtmlCompareResult'
                ProtocolId           = $specId
                MarkdownPath         = $mdPath
                LiveUrl              = $liveUrl
                FetchError           = $fetchError
                MarkdownHeadingCount = $mdHeadings.Count
                MarkdownAnchorCount  = $mdAnchors.Count
                LiveHeadingCount     = $liveHeadings.Count
                LiveAnchorCount      = $liveAnchors.Count
                MissingInMarkdown    = @($missingInMd)
                MissingInLive        = @($missingInLive)
                SuggestManualReview  = $suggestReview
                IssueCount           = $report.IssueCount
            })
        }
        catch {
            Write-Warning "Compare failed for $($report.ProtocolId): $_"
            # Same property set as the success result so that Export-Csv / Format-Table
            # keep every column even when a failure is the first object emitted.
            # Counts are left $null (unknown) rather than 0.
            [void]$results.Add([pscustomobject]@{
                PSTypeName           = 'AwakeCoding.OpenSpecs.LiveHtmlCompareResult'
                ProtocolId           = $report.ProtocolId
                MarkdownPath         = $null
                LiveUrl              = $null
                FetchError           = $_.Exception.Message
                MarkdownHeadingCount = $null
                MarkdownAnchorCount  = $null
                LiveHeadingCount     = $null
                LiveAnchorCount      = $null
                MissingInMarkdown    = @()
                MissingInLive        = @()
                SuggestManualReview  = $true
                IssueCount           = $report.IssueCount
            })
        }
    }

    $results
}

AwakeCoding.OpenSpecs/Public/Get-OpenSpecCatalog.ps1

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ function Get-OpenSpecCatalog {
1212
'(?is)<a\b[^>]*href\s*=\s*["''](?<href>\.\./(?<slug>(?:ms|mc)-[a-z0-9-]+)/(?<guid>[0-9a-f-]{36}))(?:["''][^>]*)?>(?<text>.*?)</a>'
1313
)
1414
$idRegex = [regex]::new('\[(?<id>(?:MS|MC)-[A-Z0-9-]+)\]', 'IgnoreCase')
15+
$cellRegex = [regex]::new('(?is)<td[^>]*>(?<content>.*?)</td>')
1516

1617
$seen = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
1718
$entries = New-Object System.Collections.Generic.List[object]
@@ -41,10 +42,17 @@ function Get-OpenSpecCatalog {
4142
$title = $protocolId
4243
}
4344

45+
$description = ''
46+
$cells = [regex]::Matches($rowHtml, $cellRegex)
47+
if ($cells.Count -ge 2) {
48+
$description = (ConvertFrom-OpenSpecHtml -Html $cells[1].Groups['content'].Value).Trim()
49+
}
50+
4451
$entries.Add([pscustomobject]@{
4552
PSTypeName = 'AwakeCoding.OpenSpecs.Entry'
4653
ProtocolId = $protocolId
4754
Title = $title
55+
Description = $description
4856
SpecPageUrl = $specPageUrl
4957
Slug = $slug
5058
SourcePage = $Uri
@@ -63,6 +71,7 @@ function Get-OpenSpecCatalog {
6371
PSTypeName = 'AwakeCoding.OpenSpecs.Entry'
6472
ProtocolId = $protocolId
6573
Title = $protocolId
74+
Description = ''
6675
SpecPageUrl = "https://learn.microsoft.com/en-us/openspecs/windows_protocols/$($protocolId.ToLowerInvariant())"
6776
Slug = $protocolId.ToLowerInvariant()
6877
SourcePage = $Uri

AwakeCoding.OpenSpecs/Public/Update-OpenSpecIndex.ps1

Lines changed: 71 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,35 @@ function Update-OpenSpecIndex {
22
[CmdletBinding()]
33
param(
44
[Parameter(Mandatory)]
5-
[string]$Path
5+
[string]$Path,
6+
7+
[switch]$UseCatalogTitles = $true,
8+
9+
[switch]$IncludeDescription = $false
610
)
711

812
if (-not (Test-Path -LiteralPath $Path)) {
913
throw "Output directory not found: $Path"
1014
}
1115

16+
$catalogMap = @{}
17+
if ($UseCatalogTitles) {
18+
try {
19+
$catalog = @(Get-OpenSpecCatalog)
20+
foreach ($entry in $catalog) {
21+
$catalogMap[$entry.ProtocolId] = $entry
22+
$normalized = $entry.ProtocolId -replace '-', '_'
23+
if (-not $catalogMap[$normalized]) { $catalogMap[$normalized] = $entry }
24+
}
25+
}
26+
catch {
27+
Write-Warning "Could not fetch catalog for titles: $($_.Exception.Message). Using markdown fallback."
28+
}
29+
}
30+
1231
$specDirs = Get-ChildItem -LiteralPath $Path -Directory | Sort-Object Name
1332
$entries = New-Object System.Collections.Generic.List[pscustomobject]
33+
$boilerplatePatterns = @('Intellectual Property Rights', 'Open Specifications Documentation')
1434

1535
foreach ($dir in $specDirs) {
1636
$specName = $dir.Name
@@ -25,28 +45,50 @@ function Update-OpenSpecIndex {
2545
}
2646

2747
$mdFileName = [System.IO.Path]::GetFileName($mdFile)
28-
29-
# Extract the title from line 3 of the markdown.
30-
# Expected format:
31-
# Line 1: **[MS-RDPECLIP]:**
32-
# Line 2: (blank)
33-
# Line 3: **Remote Desktop Protocol: Clipboard Virtual Channel Extension**
34-
$lines = Get-Content -LiteralPath $mdFile -TotalCount 5 -ErrorAction SilentlyContinue
3548
$title = ''
36-
if ($lines -and $lines.Count -ge 3) {
37-
$rawTitle = $lines[2]
38-
$title = $rawTitle -replace '^\*\*(.+)\*\*$', '$1'
39-
$title = $title.Trim()
49+
$description = ''
50+
51+
$catalogEntry = $catalogMap[$specName]
52+
if ($catalogEntry) {
53+
$title = $catalogEntry.Title
54+
if ($IncludeDescription -and $catalogEntry.Description) {
55+
$description = $catalogEntry.Description
56+
}
57+
}
58+
59+
if ([string]::IsNullOrWhiteSpace($title)) {
60+
$lines = Get-Content -LiteralPath $mdFile -TotalCount 30 -ErrorAction SilentlyContinue
61+
$protocolLabelRegex = [regex]::new('^\*\*\[?(?:MS|MC)-[A-Z0-9-]+\]?\s*:\s*\*\*$', 'IgnoreCase')
62+
$boldLineRegex = [regex]::new('^\*\*(.+)\*\*$')
63+
$foundLabel = $false
64+
foreach ($line in $lines) {
65+
if ($protocolLabelRegex.IsMatch($line.Trim())) {
66+
$foundLabel = $true
67+
continue
68+
}
69+
if ($foundLabel -and $boldLineRegex.IsMatch($line.Trim())) {
70+
$candidate = ($line -replace '^\*\*(.+)\*\*$', '$1').Trim()
71+
$isBoilerplate = $false
72+
foreach ($pat in $boilerplatePatterns) {
73+
if ($candidate -like "*$pat*") { $isBoilerplate = $true; break }
74+
}
75+
if (-not $isBoilerplate -and $candidate.Length -gt 2) {
76+
$title = $candidate
77+
break
78+
}
79+
}
80+
}
4081
}
4182

4283
if ([string]::IsNullOrWhiteSpace($title)) {
4384
$title = $specName
4485
}
4586

4687
[void]$entries.Add([pscustomobject]@{
47-
Name = $specName
48-
Title = $title
49-
Link = "$specName/$mdFileName"
88+
Name = $specName
89+
Title = $title
90+
Description = $description
91+
Link = "$specName/$mdFileName"
5092
})
5193
}
5294

@@ -55,11 +97,21 @@ function Update-OpenSpecIndex {
5597
[void]$sb.AppendLine()
5698
[void]$sb.AppendLine("$($entries.Count) protocol specifications converted to Markdown.")
5799
[void]$sb.AppendLine()
58-
[void]$sb.AppendLine('| Protocol | Title |')
59-
[void]$sb.AppendLine('|---|---|')
60100

61-
foreach ($entry in $entries) {
62-
[void]$sb.AppendLine("| [$($entry.Name)]($($entry.Link)) | $($entry.Title) |")
101+
if ($IncludeDescription) {
102+
[void]$sb.AppendLine('| Protocol | Title | Description |')
103+
[void]$sb.AppendLine('|---|---|---|')
104+
foreach ($entry in $entries) {
105+
$descEscaped = ($entry.Description -replace '\|', ', ' -replace '\r?\n', ' ').Trim()
106+
[void]$sb.AppendLine("| [$($entry.Name)]($($entry.Link)) | $($entry.Title) | $descEscaped |")
107+
}
108+
}
109+
else {
110+
[void]$sb.AppendLine('| Protocol | Title |')
111+
[void]$sb.AppendLine('|---|---|')
112+
foreach ($entry in $entries) {
113+
[void]$sb.AppendLine("| [$($entry.Name)]($($entry.Link)) | $($entry.Title) |")
114+
}
63115
}
64116

65117
$readmePath = Join-Path -Path $Path -ChildPath 'README.md'

0 commit comments

Comments
 (0)