Skip to content

Commit 6c03cb8

Browse files
authored
Merge pull request #315 from githubnext/autoloop/build-tsb-pandas-typescript-migration
[Autoloop: build-tsb-pandas-typescript-migration]
2 parents abf504e + 052509e commit 6c03cb8

8 files changed

Lines changed: 984 additions & 0 deletions

File tree

playground/index.html

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,11 @@ <h3><a href="format_table.html" style="color: var(--accent); text-decoration: no
496496
<p>toMarkdown() and toLaTeX() — render DataFrames and Series as Markdown tables or LaTeX tabular environments. Mirrors pandas.DataFrame.to_markdown() and to_latex().</p>
497497
<div class="status done">✅ Complete</div>
498498
</div>
499+
<div class="feature-card">
500+
<h3><a href="read_html.html" style="color: var(--accent); text-decoration: none;">🌐 readHtml — pd.read_html()</a></h3>
501+
<p>readHtml(html, opts?) — parse HTML tables into DataFrames. Header detection, NA handling, numeric coercion, thousands/decimal separators, indexCol, match filter. Mirrors pandas.read_html().</p>
502+
<div class="status done">✅ Complete</div>
503+
</div>
499504
</div>
500505
<div class="features-grid">
501506
<div class="feature-card">

playground/read_html.html

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
6+
<title>tsb – readHtml() playground</title>
7+
<style>
8+
body { font-family: system-ui, sans-serif; max-width: 860px; margin: 2rem auto; padding: 0 1rem; }
9+
h1 { color: #2563eb; }
10+
h2 { color: #1e40af; margin-top: 2rem; }
11+
textarea { width: 100%; font-family: monospace; font-size: 13px; border: 1px solid #d1d5db; border-radius: 6px; padding: 8px; box-sizing: border-box; }
12+
pre { background: #f1f5f9; padding: 1rem; border-radius: 6px; overflow-x: auto; font-size: 13px; }
13+
button { background: #2563eb; color: white; border: none; border-radius: 6px; padding: 8px 18px; cursor: pointer; font-size: 14px; }
14+
button:hover { background: #1d4ed8; }
15+
.output { margin-top: 1rem; }
16+
table { border-collapse: collapse; width: 100%; margin-top: 0.5rem; }
17+
th, td { border: 1px solid #d1d5db; padding: 6px 12px; text-align: left; font-size: 13px; }
18+
th { background: #f1f5f9; font-weight: 600; }
19+
.example-list li { margin-bottom: 0.4rem; }
20+
</style>
21+
</head>
22+
<body>
23+
<h1>🐼 tsb – <code>readHtml()</code></h1>
24+
<p>
25+
<code>readHtml(html, opts?)</code> mirrors <a href="https://pandas.pydata.org/docs/reference/api/pandas.read_html.html" target="_blank"><code>pandas.read_html()</code></a>.
26+
It scans an HTML string for <code>&lt;table&gt;</code> elements and returns one <strong>DataFrame</strong> per table found.
27+
</p>
28+
29+
<h2>Live Demo</h2>
30+
<p>Paste or edit HTML below, then click <strong>Parse</strong>.</p>
31+
<textarea id="html-input" rows="14">
32+
&lt;table&gt;
33+
&lt;thead&gt;
34+
&lt;tr&gt;&lt;th&gt;Country&lt;/th&gt;&lt;th&gt;Population (M)&lt;/th&gt;&lt;th&gt;GDP (B USD)&lt;/th&gt;&lt;/tr&gt;
35+
&lt;/thead&gt;
36+
&lt;tbody&gt;
37+
&lt;tr&gt;&lt;td&gt;USA&lt;/td&gt;&lt;td&gt;331&lt;/td&gt;&lt;td&gt;23000&lt;/td&gt;&lt;/tr&gt;
38+
&lt;tr&gt;&lt;td&gt;China&lt;/td&gt;&lt;td&gt;1411&lt;/td&gt;&lt;td&gt;17700&lt;/td&gt;&lt;/tr&gt;
39+
&lt;tr&gt;&lt;td&gt;Germany&lt;/td&gt;&lt;td&gt;84&lt;/td&gt;&lt;td&gt;4500&lt;/td&gt;&lt;/tr&gt;
40+
&lt;/tbody&gt;
41+
&lt;/table&gt;
42+
</textarea>
43+
<br /><br />
44+
<label>header row index: <input id="header-input" type="number" value="0" style="width:60px" /></label>
45+
&nbsp;
46+
<label>converters (numeric): <input id="converters-input" type="checkbox" checked /></label>
47+
&nbsp;
48+
<label>nrows: <input id="nrows-input" type="number" placeholder="all" style="width:60px" /></label>
49+
<br /><br />
50+
<button onclick="run()">▶ Parse</button>
51+
52+
<div id="output" class="output"></div>
53+
54+
<h2>Code Example</h2>
55+
<pre><code>import { readHtml } from "tsb";
56+
57+
const html = `&lt;table&gt;
58+
&lt;thead&gt;&lt;tr&gt;&lt;th&gt;Name&lt;/th&gt;&lt;th&gt;Age&lt;/th&gt;&lt;/tr&gt;&lt;/thead&gt;
59+
&lt;tbody&gt;
60+
&lt;tr&gt;&lt;td&gt;Alice&lt;/td&gt;&lt;td&gt;30&lt;/td&gt;&lt;/tr&gt;
61+
&lt;tr&gt;&lt;td&gt;Bob&lt;/td&gt;&lt;td&gt;25&lt;/td&gt;&lt;/tr&gt;
62+
&lt;/tbody&gt;
63+
&lt;/table&gt;`;
64+
65+
const [df] = readHtml(html);
66+
console.log(df.columns); // ["Name", "Age"]
67+
console.log(df.shape); // [2, 2]
68+
console.log(df.toRecords());
69+
// [{ Name: "Alice", Age: 30 }, { Name: "Bob", Age: 25 }]</code></pre>
70+
71+
<h2>Supported Options</h2>
72+
<ul class="example-list">
73+
<li><code>header</code> — which row to use as column names (default <code>0</code>). Use <code>null</code> for no header.</li>
74+
<li><code>indexCol</code> — column name or index to use as the row index.</li>
75+
<li><code>match</code> — array of table indices to return (e.g. <code>[0, 2]</code>).</li>
76+
<li><code>naValues</code> — extra strings to treat as NaN (default includes <code>""</code>, <code>"NA"</code>, <code>"NaN"</code>, <code>"None"</code>).</li>
77+
<li><code>converters</code> — try to convert cells to numbers (default <code>true</code>).</li>
78+
<li><code>thousands</code> — thousands-separator character, e.g. <code>","</code>.</li>
79+
<li><code>decimal</code> — decimal separator, default <code>"."</code>.</li>
80+
<li><code>skipRows</code> — 0-based row indices to skip in the body.</li>
81+
<li><code>nrows</code> — maximum rows to return.</li>
82+
<li><code>skipBlankLines</code> — skip rows where all cells are whitespace (default <code>true</code>).</li>
83+
</ul>
84+
85+
<script type="module">
86+
// In a real app: import { readHtml } from "tsb"
87+
// Here we inline the core logic for the demo.
88+
89+
// ── mini HTML parser (same as src/io/read_html.ts) ──────────────────────
90+
/**
 * Remove markup from an HTML fragment and decode the character references
 * it contains.
 *
 * Tags are stripped first, then every character reference is decoded in a
 * single pass so that already-decoded output is never decoded again
 * (e.g. `&amp;lt;` yields the text `&lt;`, not `<`).
 *
 * Supported references: `&amp;`, `&lt;`, `&gt;`, `&nbsp;` (decoded to a
 * plain space), `&quot;` (all case-insensitive), decimal `&#NNN;` and
 * hexadecimal `&#xHHH;`.
 *
 * @param {string} html - HTML fragment, e.g. the inner HTML of a table cell.
 * @returns {string} Plain text with tags removed and references decoded.
 */
function stripTags(html) {
  const namedEntities = { amp: "&", lt: "<", gt: ">", nbsp: " ", quot: '"' };
  const entityPattern = /&(?:(amp|lt|gt|nbsp|quot)|#(\d+)|#[xX]([0-9a-fA-F]+));/gi;

  return html
    .replace(/<[^>]*>/g, "")
    .replace(entityPattern, (match, name, dec, hex) => {
      if (name !== undefined) return namedEntities[name.toLowerCase()];
      const codePoint =
        dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
      // fromCodePoint handles astral code points (emoji etc.) but throws a
      // RangeError above U+10FFFF, so out-of-range references are left as-is.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    });
}
99+
/**
 * Collect the inner HTML of every `<tag ...>...</tag>` pair found in `html`.
 *
 * For each opening tag, the block ends at the first closing tag that follows
 * it; opening tags with no closing tag after them contribute nothing.
 * Matching is case-insensitive.
 *
 * @param {string} html - Markup to scan.
 * @param {string} tag - Element name, interpolated into the patterns as-is.
 * @returns {string[]} Inner HTML of each matched element, in document order.
 */
function extractBlocks(html, tag) {
  const openTag = new RegExp(`<${tag}(?:\\s[^>]*)?>`, "gi");
  const closeTag = new RegExp(`</${tag}>`, "gi");
  const blocks = [];

  for (const opening of html.matchAll(openTag)) {
    const contentStart = opening.index + opening[0].length;
    // Search for the closing tag only after the end of this opening tag.
    closeTag.lastIndex = contentStart;
    const closing = closeTag.exec(html);
    if (closing !== null) {
      blocks.push(html.slice(contentStart, closing.index));
    }
  }

  return blocks;
}
113+
/**
 * Return the outer HTML of every top-level `<table>` element in `html`.
 *
 * Nested tables are kept inside their enclosing table's HTML rather than
 * being returned separately. Closing tags that have no matching opening tag
 * are ignored, so a stray `</table>` cannot unbalance the nesting counter
 * and hide the tables that follow it.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} Outer HTML of each top-level table, in document order.
 */
function extractTables(html) {
  const tables = [];
  const tableTag = /<\/?table(?:\s[^>]*)?>/gi;
  let depth = 0;
  let start = -1;

  for (const m of html.matchAll(tableTag)) {
    const isClosing = m[0].startsWith("</");

    if (!isClosing) {
      if (depth === 0) start = m.index;
      depth += 1;
      continue;
    }

    // Unmatched closing tag: skip it instead of letting depth go negative.
    if (depth === 0) continue;

    depth -= 1;
    if (depth === 0 && start >= 0) {
      tables.push(html.slice(start, m.index + m[0].length));
      start = -1;
    }
  }

  return tables;
}
126+
/**
 * Split a table section into rows of trimmed, plain-text cell values.
 *
 * Each `<tr>` block becomes one array; each `<td>`/`<th>` inside it becomes
 * one entry, passed through `stripTags` and trimmed.
 *
 * @param {string} sectionHtml - Markup containing `<tr>` elements.
 * @returns {string[][]} One array of cell texts per row, in document order.
 */
function parseRows(sectionHtml) {
  return extractBlocks(sectionHtml, "tr").map((rowHtml) => {
    const cellPattern = /<t[dh](?:\s[^>]*)?>([\s\S]*?)<\/t[dh]>/gi;
    return Array.from(rowHtml.matchAll(cellPattern), (cell) =>
      stripTags(cell[1] ?? "").trim(),
    );
  });
}
136+
/**
 * Parse one table's HTML into a flat list of rows.
 *
 * Rows are emitted in the order: all `<thead>` rows, then body rows, then
 * all `<tfoot>` rows. Body rows come from the `<tbody>` sections when any
 * exist; otherwise from whatever `<tr>` elements remain once the `<thead>`
 * and `<tfoot>` sections have been removed from the markup.
 *
 * @param {string} tableHtml - HTML of a single table.
 * @returns {string[][]} Rows of cell texts.
 */
function parseTableHtml(tableHtml) {
  const rowsOf = (sections) => sections.flatMap((section) => parseRows(section));

  const headRows = rowsOf(extractBlocks(tableHtml, "thead"));

  const bodySections = extractBlocks(tableHtml, "tbody");
  let bodyRows;
  if (bodySections.length) {
    bodyRows = rowsOf(bodySections);
  } else {
    // No explicit <tbody>: drop header/footer sections and parse the rest.
    const remainder = tableHtml
      .replace(/<thead[\s\S]*?<\/thead>/gi, "")
      .replace(/<tfoot[\s\S]*?<\/tfoot>/gi, "");
    bodyRows = parseRows(remainder);
  }

  const footRows = rowsOf(extractBlocks(tableHtml, "tfoot"));

  return [...headRows, ...bodyRows, ...footRows];
}
145+
// Cell texts treated as missing values when the caller supplies no NA set.
const DEFAULT_NA = new Set(["", "NA", "NaN", "N/A", "null", "None", "nan"]);

/**
 * Convert one raw cell string into its final value.
 *
 * @param {string} raw - Cell text.
 * @param {Set<string>} na - Strings that map to `null`.
 * @param {boolean} num - Whether numeric conversion should be attempted.
 * @param {string|null} thousands - Grouping separator to strip, if any.
 * @param {string} decimal - Decimal separator used in `raw`.
 * @returns {string|number|null} `null` for NA strings, a number when the
 *   cleaned text parses as one, otherwise the untouched `raw` string.
 */
function coerce(raw, na, num, thousands, decimal) {
  if (na.has(raw)) return null;
  if (!num) return raw;

  const ungrouped = thousands ? raw.split(thousands).join("") : raw;
  // Only the first occurrence of a custom decimal separator is replaced.
  const normalized = decimal === "." ? ungrouped : ungrouped.replace(decimal, ".");

  // Number("") and Number("  ") are 0, so blank text must stay a string.
  if (normalized.trim() === "") return raw;

  const parsed = Number(normalized);
  return Number.isNaN(parsed) ? raw : parsed;
}
154+
/**
 * Parse every table in an HTML string into a column-oriented result.
 *
 * @param {string} html - Markup to scan for `<table>` elements.
 * @param {object} [opts]
 * @param {number|null} [opts.header=0] - Row index used for column names;
 *   `null` generates names "0", "1", ... from the width of the first row.
 * @param {*} [opts.indexCol=null] - Passed through unchanged on each result.
 * @param {number[]} [opts.match] - Indices of the tables to return.
 * @param {Iterable<string>} [opts.naValues] - Extra strings treated as
 *   missing, in addition to the defaults in `DEFAULT_NA`.
 * @param {boolean} [opts.converters=true] - Attempt numeric conversion.
 * @param {boolean} [opts.skipBlankLines=true] - Drop rows whose cells are
 *   all whitespace.
 * @param {number[]} [opts.skipRows] - 0-based body row indices to drop.
 * @param {number} [opts.nrows] - Maximum number of body rows to keep.
 * @param {string|null} [opts.thousands=null] - Grouping separator.
 * @param {string} [opts.decimal="."] - Decimal separator.
 * @returns {{columns: string[], data: Array<Array<string|number|null>>, indexCol: *}[]}
 *   One entry per selected table; `data[i]` holds the values of column `i`.
 */
function readHtml(html, opts = {}) {
  const {
    header = 0,
    indexCol = null,
    match: matchOpt,
    naValues,
    converters = true,
    skipBlankLines = true,
    skipRows,
    nrows,
    thousands = null,
    decimal = ".",
  } = opts;

  // naValues extends the defaults rather than replacing them, so that "" and
  // "NA" stay missing when a caller only adds a marker such as "-".
  const na = naValues ? new Set([...DEFAULT_NA, ...naValues]) : DEFAULT_NA;
  const result = [];

  extractTables(html).forEach((tableHtml, ti) => {
    if (matchOpt && !matchOpt.includes(ti)) return;

    const raw = parseTableHtml(tableHtml);
    let cols;
    let dataStart;

    if (header === null) {
      const width = raw[0]?.length ?? 0;
      cols = Array.from({ length: width }, (_, i) => String(i));
      dataStart = 0;
    } else {
      // Repeated header names get ".1", ".2", ... suffixes.
      const seen = new Map();
      cols = (raw[header] ?? []).map((name) => {
        const count = seen.get(name);
        if (count === undefined) {
          seen.set(name, 1);
          return name;
        }
        seen.set(name, count + 1);
        return `${name}.${count}`;
      });
      dataStart = header + 1;
    }

    let body = raw.slice(dataStart);
    if (skipRows?.length) {
      const skipped = new Set(skipRows);
      body = body.filter((_, i) => !skipped.has(i));
    }
    if (skipBlankLines) {
      body = body.filter((row) => row.some((cell) => cell.trim() !== ""));
    }
    if (nrows !== undefined) {
      body = body.slice(0, nrows);
    }

    const nc = cols.length;
    const colArrays = Array.from({ length: nc }, () => []);
    for (const row of body) {
      for (let ci = 0; ci < nc; ci++) {
        // Short rows are padded with "" before coercion.
        colArrays[ci].push(coerce(row[ci] ?? "", na, converters, thousands, decimal));
      }
    }

    result.push({ columns: cols, data: colArrays, indexCol });
  });

  return result;
}
175+
176+
// ── demo renderer ──────────────────────────────────────────────────────────
177+
/**
 * Demo entry point: read the form controls, parse the textarea contents with
 * `readHtml`, and render each resulting table into the `#output` element.
 *
 * All parsed text (column names, cell values, error messages) is
 * HTML-escaped before being written with `innerHTML`, because cell contents
 * come from arbitrary pasted markup and may contain `<` or `&`.
 */
window.run = function () {
  const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  const output = document.getElementById("output");
  // textarea.value is already the decoded text the user sees; it is passed
  // to readHtml as-is so entities typed inside cells are decoded only once.
  const htmlRaw = document.getElementById("html-input").value;
  const headerVal = document.getElementById("header-input").value;
  const convertersVal = document.getElementById("converters-input").checked;
  const nrowsVal = document.getElementById("nrows-input").value;

  const opts = {
    header: headerVal === "" ? null : Number(headerVal),
    converters: convertersVal,
  };
  if (nrowsVal) opts.nrows = Number(nrowsVal);

  let dfs;
  try {
    dfs = readHtml(htmlRaw, opts);
  } catch (e) {
    output.innerHTML = `<pre style="color:red">${escapeHtml(e)}</pre>`;
    return;
  }

  if (dfs.length === 0) {
    output.innerHTML = "<p>No tables found.</p>";
    return;
  }

  let html = `<p>Found <strong>${dfs.length}</strong> table(s).</p>`;
  for (let i = 0; i < dfs.length; i++) {
    const df = dfs[i];
    const nrow = df.data[0]?.length ?? 0;

    html += `<h3>Table ${i}</h3>`;
    html += `<p>Shape: ${nrow} rows × ${df.columns.length} cols</p>`;
    html += "<table><thead><tr>";
    for (const c of df.columns) html += `<th>${escapeHtml(c)}</th>`;
    html += "</tr></thead><tbody>";

    for (let r = 0; r < nrow; r++) {
      html += "<tr>";
      for (let c = 0; c < df.columns.length; c++) {
        const v = df.data[c]?.[r];
        html += `<td>${v === null ? "<em>null</em>" : escapeHtml(v)}</td>`;
      }
      html += "</tr>";
    }
    html += "</tbody></table>";
  }
  output.innerHTML = html;
};

// Auto-run on load
window.run();
229+
</script>
230+
</body>
231+
</html>

src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ export { jsonNormalize } from "./io/index.ts";
6060
export type { JsonNormalizeOptions, JsonPath } from "./io/index.ts";
6161
export { toJsonDenormalize, toJsonRecords, toJsonSplit, toJsonIndex } from "./io/index.ts";
6262
export type { JsonDenormalizeOptions, JsonSplitOptions, JsonSplitResult } from "./io/index.ts";
63+
export { readHtml } from "./io/index.ts";
64+
export type { ReadHtmlOptions } from "./io/index.ts";
6365
export { pearsonCorr, dataFrameCorr, dataFrameCov } from "./stats/index.ts";
6466
export type { CorrMethod, CorrOptions, CovOptions } from "./stats/index.ts";
6567
export { Rolling } from "./window/index.ts";

src/io/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ export type {
2121
JsonSplitOptions,
2222
JsonSplitResult,
2323
} from "./to_json_normalize.ts";
24+
export { readHtml } from "./read_html.ts";
25+
export type { ReadHtmlOptions } from "./read_html.ts";
26+
2427
// readExcel / xlsxSheetNames use node:zlib and cannot be bundled for the
2528
// browser. Import them directly from "tsb/io/read_excel" when running in
2629
// Node / Bun.

0 commit comments

Comments
 (0)