Skip to content

Commit 09b8690

Browse files
committed
initial commit
0 parents  commit 09b8690

25 files changed

Lines changed: 2426 additions & 0 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
artifacts/
2+
downloads/
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
@{
    # Module manifest: identifies the script module and declares what it exports.

    # --- Identity -----------------------------------------------------------
    RootModule           = 'AwakeCoding.OpenSpecs.psm1'
    ModuleVersion        = '0.1.0'
    GUID                 = '0d2d7c64-bf31-4f62-9d70-396f6e31b596'
    Author               = 'AwakeCoding'
    CompanyName          = 'AwakeCoding'
    Copyright            = '(c) AwakeCoding. All rights reserved.'
    Description          = 'Downloads Microsoft Open Specifications Windows Protocol documents by scraping Learn pages for PDF and DOCX links.'

    # --- Host requirements --------------------------------------------------
    PowerShellVersion    = '5.1'
    CompatiblePSEditions = @('Desktop', 'Core')

    # --- Exports ------------------------------------------------------------
    # Only functions are exported; cmdlets, variables and aliases are
    # explicitly declared empty.
    FunctionsToExport    = @(
        'Get-OpenSpecCatalog', 'Find-OpenSpec', 'Get-OpenSpecVersion',
        'Get-OpenSpecDownloadLink', 'Save-OpenSpecDocument', 'Test-OpenSpecDownload',
        'Convert-OpenSpecToMarkdown', 'Invoke-OpenSpecConversionPipeline',
        'Get-OpenSpecConversionReport', 'Test-OpenSpecMarkdownFidelity'
    )
    CmdletsToExport      = @()
    VariablesToExport    = @()
    AliasesToExport      = @()

    # --- Gallery metadata ---------------------------------------------------
    PrivateData          = @{
        PSData = @{
            Tags       = @(
                'OpenSpecs',
                'WindowsProtocol',
                'Scraping',
                'Downloader',
                'Markdown',
                'Conversion'
            )
            ProjectUri = 'https://learn.microsoft.com/en-us/openspecs/windows_protocols/MS-WINPROTLP/e36c976a-6263-42a8-b119-7a3cc41ddd2a'
            LicenseUri = 'https://learn.microsoft.com/en-us/legal/termsofuse'
        }
    }
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Module loader.
# Dot-sources every *.ps1 under .\Private, then every *.ps1 under .\Public,
# and exports one function per Public script, named after the script's base name.
# A missing folder is tolerated (-ErrorAction SilentlyContinue yields no files).

$privateFolder = Join-Path -Path $PSScriptRoot -ChildPath 'Private'
$privateFunctions = Get-ChildItem -Path $privateFolder -Filter '*.ps1' -ErrorAction SilentlyContinue

# Private helpers are loaded first, before any public script is dot-sourced.
foreach ($file in $privateFunctions) {
    . $file.FullName
}

$publicFolder = Join-Path -Path $PSScriptRoot -ChildPath 'Public'
$publicFunctions = Get-ChildItem -Path $publicFolder -Filter '*.ps1' -ErrorAction SilentlyContinue

foreach ($file in $publicFunctions) {
    . $file.FullName
}

# Member enumeration turns the file list into the list of base names.
Export-ModuleMember -Function $publicFunctions.BaseName
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
function ConvertFrom-OpenSpecDocx {
    <#
    .SYNOPSIS
    Converts a DOCX file to GitHub-flavored Markdown by invoking pandoc.

    .DESCRIPTION
    Ensures the output directory and an 'assets\media' sub-directory exist,
    then runs pandoc with '--from docx --to gfm --wrap=none', extracting
    embedded media into the media directory. Anything pandoc prints on
    stdout is forwarded to the verbose stream so that the function emits
    exactly one result object.

    .PARAMETER InputPath
    Path of the DOCX file handed to pandoc.

    .PARAMETER OutputPath
    Path of the Markdown file to write. A bare file name (no directory part)
    is written relative to the current location.

    .PARAMETER Toolchain
    Object exposing HasPandoc (truthy when pandoc is usable) and PandocPath
    (the command to invoke).

    .OUTPUTS
    A custom object typed 'AwakeCoding.OpenSpecs.ConversionStep' with
    Strategy 'pandoc-docx', the OutputPath, and an empty Notes array.

    .NOTES
    Throws when pandoc is unavailable, exits with a non-zero code, or does
    not leave a file at OutputPath.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$InputPath,

        [Parameter(Mandatory)]
        [string]$OutputPath,

        [Parameter(Mandatory)]
        [object]$Toolchain
    )

    if (-not $Toolchain.HasPandoc) {
        throw 'pandoc is required for DOCX conversion.'
    }

    # Split-Path yields an empty string for a bare file name such as 'out.md';
    # an empty path cannot be bound to Test-Path/New-Item/Join-Path, so fall
    # back to the current location.
    $outputDirectory = Split-Path -Path $OutputPath -Parent
    if ([string]::IsNullOrEmpty($outputDirectory)) {
        $outputDirectory = '.'
    }

    if (-not (Test-Path -LiteralPath $outputDirectory)) {
        [void](New-Item -Path $outputDirectory -ItemType Directory -Force)
    }

    $mediaDirectory = Join-Path -Path $outputDirectory -ChildPath 'assets\media'
    if (-not (Test-Path -LiteralPath $mediaDirectory)) {
        [void](New-Item -Path $mediaDirectory -ItemType Directory -Force)
    }

    $arguments = @(
        '--from', 'docx',
        '--to', 'gfm',
        '--wrap=none',
        '--extract-media', $mediaDirectory,
        '--output', $OutputPath,
        $InputPath
    )

    # Route any stdout from the tool to the verbose stream; left unhandled it
    # would be emitted on the success stream ahead of the result object.
    & $Toolchain.PandocPath @arguments | ForEach-Object { Write-Verbose -Message "pandoc: $_" }
    if ($LASTEXITCODE -ne 0 -or -not (Test-Path -LiteralPath $OutputPath)) {
        throw "pandoc conversion failed for '$InputPath'."
    }

    return [pscustomobject]@{
        PSTypeName = 'AwakeCoding.OpenSpecs.ConversionStep'
        Strategy   = 'pandoc-docx'
        OutputPath = $OutputPath
        Notes      = @()
    }
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
function ConvertFrom-OpenSpecHtml {
    <#
    .SYNOPSIS
    Reduces an HTML fragment to plain, single-spaced text.

    .DESCRIPTION
    Replaces every '<...>' tag with a space, decodes HTML entities, collapses
    each run of whitespace into one space and trims the ends.

    .PARAMETER Html
    The HTML text to flatten. Null or empty input yields an empty string.

    .OUTPUTS
    System.String
    #>
    [CmdletBinding()]
    param(
        [Parameter()]
        [AllowEmptyString()]
        [string]$Html
    )

    if ([string]::IsNullOrEmpty($Html)) {
        return ''
    }

    # Tags become a space (not nothing) so neighbouring text nodes do not fuse.
    $text = $Html -replace '<[^>]+>', ' '

    # Entities are decoded after tag removal, so an encoded '&lt;b&gt;'
    # survives as literal text.
    $text = [System.Net.WebUtility]::HtmlDecode($text)

    return ($text -replace '\s+', ' ').Trim()
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
function ConvertFrom-OpenSpecPdf {
    <#
    .SYNOPSIS
    Converts a PDF file to Markdown using the first available converter.

    .DESCRIPTION
    Tries, in order, docling, markitdown and pandoc, according to the
    Has* flags on the toolchain object. The first tool that exits with
    code 0 and leaves the expected Markdown file wins. Each detected tool
    that fails contributes a note that is included in the final error.
    Anything a tool prints on stdout is forwarded to the verbose stream so
    that the function emits exactly one result object.

    .PARAMETER InputPath
    Path of the PDF file to convert.

    .PARAMETER OutputPath
    Path of the Markdown file to write. A bare file name (no directory part)
    is written relative to the current location.

    .PARAMETER Toolchain
    Object exposing HasDocling/DoclingPath, HasMarkItDown/MarkItDownPath and
    HasPandoc/PandocPath.

    .OUTPUTS
    A custom object typed 'AwakeCoding.OpenSpecs.ConversionStep' with the
    Strategy that succeeded ('docling-pdf', 'markitdown-pdf' or
    'pandoc-pdf-fallback'), the OutputPath, and a one-element Notes array.

    .NOTES
    Throws when no converter succeeds; the message carries the collected notes.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$InputPath,

        [Parameter(Mandatory)]
        [string]$OutputPath,

        [Parameter(Mandatory)]
        [object]$Toolchain
    )

    # Split-Path yields an empty string for a bare file name such as 'out.md';
    # an empty path cannot be bound to Test-Path/New-Item/Join-Path, so fall
    # back to the current location.
    $outputDirectory = Split-Path -Path $OutputPath -Parent
    if ([string]::IsNullOrEmpty($outputDirectory)) {
        $outputDirectory = '.'
    }

    if (-not (Test-Path -LiteralPath $outputDirectory)) {
        [void](New-Item -Path $outputDirectory -ItemType Directory -Force)
    }

    $notes = New-Object System.Collections.Generic.List[string]

    if ($Toolchain.HasDocling) {
        $doclingArguments = @('--to', 'md', '--output', $outputDirectory, $InputPath)

        # Tool stdout goes to the verbose stream; left unhandled it would be
        # emitted on the success stream ahead of the result object.
        & $Toolchain.DoclingPath @doclingArguments | ForEach-Object { Write-Verbose -Message "docling: $_" }
        if ($LASTEXITCODE -eq 0) {
            # docling is given a directory, so the file it is expected to
            # produce is '<input base name>.md' inside that directory.
            $candidate = Join-Path -Path $outputDirectory -ChildPath ("{0}.md" -f [System.IO.Path]::GetFileNameWithoutExtension($InputPath))
            if (Test-Path -LiteralPath $candidate) {
                # Skip the move when the candidate already is the requested
                # output path, rather than moving a file onto itself.
                $candidateFullPath = [System.IO.Path]::GetFullPath($candidate)
                $requestedFullPath = [System.IO.Path]::GetFullPath($OutputPath)
                if (-not [string]::Equals($candidateFullPath, $requestedFullPath, [System.StringComparison]::Ordinal)) {
                    Move-Item -LiteralPath $candidate -Destination $OutputPath -Force
                }

                return [pscustomobject]@{
                    PSTypeName = 'AwakeCoding.OpenSpecs.ConversionStep'
                    Strategy   = 'docling-pdf'
                    OutputPath = $OutputPath
                    Notes      = @('Converted with docling CLI.')
                }
            }
        }

        $notes.Add('docling was detected but did not produce expected markdown output.')
    }

    if ($Toolchain.HasMarkItDown) {
        $markitdownArguments = @($InputPath, '--output', $OutputPath)
        & $Toolchain.MarkItDownPath @markitdownArguments | ForEach-Object { Write-Verbose -Message "markitdown: $_" }
        if ($LASTEXITCODE -eq 0 -and (Test-Path -LiteralPath $OutputPath)) {
            return [pscustomobject]@{
                PSTypeName = 'AwakeCoding.OpenSpecs.ConversionStep'
                Strategy   = 'markitdown-pdf'
                OutputPath = $OutputPath
                Notes      = @('Converted with markitdown CLI.')
            }
        }

        $notes.Add('markitdown was detected but conversion failed.')
    }

    if ($Toolchain.HasPandoc) {
        # NOTE(review): pandoc's documented input formats do not appear to
        # include PDF, so this branch likely always ends in the failure note
        # below - confirm against the pandoc version in use.
        $pandocArguments = @('--from', 'pdf', '--to', 'gfm', '--wrap=none', '--output', $OutputPath, $InputPath)
        & $Toolchain.PandocPath @pandocArguments | ForEach-Object { Write-Verbose -Message "pandoc: $_" }
        if ($LASTEXITCODE -eq 0 -and (Test-Path -LiteralPath $OutputPath)) {
            return [pscustomobject]@{
                PSTypeName = 'AwakeCoding.OpenSpecs.ConversionStep'
                Strategy   = 'pandoc-pdf-fallback'
                OutputPath = $OutputPath
                Notes      = @('Converted with pandoc fallback from PDF.')
            }
        }

        $notes.Add('pandoc fallback from PDF failed.')
    }

    throw ("Unable to convert PDF '{0}' to Markdown. {1}" -f $InputPath, ($notes -join ' '))
}

0 commit comments

Comments
 (0)