-
-
Notifications
You must be signed in to change notification settings - Fork 31
Expand file tree
/
Copy pathGet-HTMLTables.ps1
More file actions
143 lines (110 loc) · 5.18 KB
/
Get-HTMLTables.ps1
File metadata and controls
143 lines (110 loc) · 5.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
<#
.SYNOPSIS
Extracts tables from an HTML page and returns them as an array of objects.
.DESCRIPTION
Loads an HTML document either from the web (Invoke-WebRequest) or from a local
file (through the 'HTMLFile' COM object), locates its <table> elements and emits
one object per data row.

A row whose first cell is a <th> is treated as the header row: its cell texts
become the property names for the rows that follow and no object is emitted for
it. When a table has no header row, properties are named "P1", "P2", etc.
Cells whose column has no (or an empty) title are skipped.

For web pages the HTML is parsed with regular expressions. The table pattern is
non-greedy and stops at the first closing </table> tag, so nested tables are not
handled.
.PARAMETER URL
The URL of the HTML page to extract tables from, or the path of a local HTML
file when -LocalFile is $true.
.PARAMETER TableNumber
The number of the table to extract (starting from 0). If not specified, all tables will be extracted.
.PARAMETER LocalFile
Set to $true to indicate that the URL parameter is a local file path rather than a web URL.
.OUTPUTS
PSCustomObject. Each object has a 'TableNumber' property followed by one
property per titled column. 'TableNumber' is a 1-based counter over the tables
that were emitted, so it is always 1 when a single table is selected with
-TableNumber.
.EXAMPLE
Get-HTMLTables -URL 'https://example.com/report.html' -TableNumber 0
#>
function Get-HTMLTables {
    param(
        [Parameter(Mandatory)]
        [String] $URL,
        [Parameter(Mandatory = $false)]
        [int] $TableNumber,
        [Parameter(Mandatory = $false)]
        [boolean] $LocalFile
    )

    [System.Collections.Generic.List[PSObject]]$tablesArray = @()

    # 0 is a valid table index, so test whether the parameter was supplied
    # instead of testing its truthiness.
    $tableRequested = $PSBoundParameters.ContainsKey('TableNumber')

    if ($LocalFile) {
        $html = New-Object -ComObject 'HTMLFile'
        $source = Get-Content -Path $URL -Raw
        $html.IHTMLDocument2_write($source)
        # html does not have ParseHTML because it is already an HTMLDocumentClass
        # Cast to an array in case there is only one element
        $tables = @($html.getElementsByTagName('TABLE'))
    }
    else {
        $WebRequest = Invoke-WebRequest $URL -UseBasicParsing
        # Parse HTML manually using regex since COM objects don't work reliably in PowerShell 7
        $htmlContent = $WebRequest.Content
        $regexOptions = [System.Text.RegularExpressions.RegexOptions]::IgnoreCase -bor [System.Text.RegularExpressions.RegexOptions]::Singleline

        # Build mock table/row/cell objects exposing the members the row loop
        # below reads (Rows, Cells, tagName, InnerText). Generic lists are used
        # so that appending does not copy the whole array on every iteration.
        $tableList = New-Object 'System.Collections.Generic.List[PSObject]'
        # \b keeps '<table', '<tr' and '<th' from matching longer tag names
        # such as '<thead>' or '<track>'.
        foreach ($tableMatch in [regex]::Matches($htmlContent, '<table\b[^>]*>.*?</table>', $regexOptions)) {
            $tableHtml = $tableMatch.Value
            $rowList = New-Object 'System.Collections.Generic.List[PSObject]'

            foreach ($rowMatch in [regex]::Matches($tableHtml, '<tr\b[^>]*>.*?</tr>', $regexOptions)) {
                $rowHtml = $rowMatch.Value
                $cellList = New-Object 'System.Collections.Generic.List[PSObject]'

                # Extract cells (th or td); group 1 captures the opening tag name
                foreach ($cellMatch in [regex]::Matches($rowHtml, '<(th|td)\b[^>]*>.*?</(th|td)>', $regexOptions)) {
                    $cellTag = 'TD'
                    if ($cellMatch.Groups[1].Value -eq 'th') { $cellTag = 'TH' }

                    # Strip the markup first, then decode entities, so encoded
                    # angle brackets in the text are not mistaken for tags.
                    $plainText = [regex]::Replace($cellMatch.Value, '<[^>]+>', '')
                    $plainText = [System.Net.WebUtility]::HtmlDecode($plainText)
                    # A decoded non-breaking space becomes an ordinary space
                    $plainText = $plainText.Replace([string][char]0xA0, ' ')

                    $cellList.Add([PSCustomObject]@{
                            tagName   = $cellTag
                            InnerText = $plainText.Trim()
                        })
                }

                $rowList.Add([PSCustomObject]@{
                        InnerHtml = $rowHtml
                        Cells     = $cellList.ToArray()
                    })
            }

            $tableList.Add([PSCustomObject]@{
                    InnerHtml = $tableHtml
                    Rows      = $rowList.ToArray()
                })
        }
        $tables = @($tableList)
    }

    ## Narrow down to the requested table, if one was asked for
    if ($tableRequested) {
        $selectedTable = $tables[$TableNumber]
        if ($null -eq $selectedTable) {
            # Without this guard a missing table would be processed as a single
            # empty row and produce a meaningless object.
            Write-Error "Table number $TableNumber was not found; the document contains $($tables.Count) table(s)."
            return $tablesArray
        }
        # Cast to an array because there is only one element
        $tables = @($selectedTable)
    }

    ## Go through all of the rows in each table.
    ## A separate counter is used: PowerShell variable names are case-insensitive,
    ## so reusing "$tableNumber" here would overwrite the $TableNumber parameter.
    $tableOrdinal = 0
    foreach ($table in $tables) {
        $titles = @()
        $rows = @($table.Rows)
        $tableOrdinal++

        foreach ($row in $rows) {
            $cells = @($row.Cells)

            ## If we've found a table header, remember its titles
            if ($cells[0].tagName -eq 'TH') {
                $titles = @($cells | ForEach-Object { ('' + $_.InnerText).Trim() })
                continue
            }

            ## If we haven't found any table headers, make up names "P1", "P2", etc.
            if (-not $titles) {
                $titles = @(1..($cells.Count + 2) | ForEach-Object { "P$_" })
            }

            ## Now go through the cells in the row. For each, try to find the
            ## title that represents that column and create a hashtable mapping those
            ## titles to content
            $resultObject = [Ordered] @{
                'TableNumber' = $tableOrdinal
            }
            for ($counter = 0; $counter -lt $cells.Count; $counter++) {
                $title = $titles[$counter]
                # Columns without a title (empty header or beyond the header width) are dropped
                if (-not $title) { continue }
                $resultObject[$title] = ('' + $cells[$counter].InnerText).Trim()
            }

            ## And finally cast that hashtable to a PSCustomObject and add it to the result list
            $tablesArray.Add([PSCustomObject] $resultObject)
        }
    }

    return $tablesArray
}