|
| 1 | +<!DOCTYPE html> |
| 2 | +<html lang="en"> |
| 3 | +<head> |
| 4 | + <meta charset="UTF-8" /> |
| 5 | + <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| 6 | + <title>tsb – readHtml() playground</title> |
| 7 | + <style> |
| 8 | + body { font-family: system-ui, sans-serif; max-width: 860px; margin: 2rem auto; padding: 0 1rem; } |
| 9 | + h1 { color: #2563eb; } |
| 10 | + h2 { color: #1e40af; margin-top: 2rem; } |
| 11 | + textarea { width: 100%; font-family: monospace; font-size: 13px; border: 1px solid #d1d5db; border-radius: 6px; padding: 8px; box-sizing: border-box; } |
| 12 | + pre { background: #f1f5f9; padding: 1rem; border-radius: 6px; overflow-x: auto; font-size: 13px; } |
| 13 | + button { background: #2563eb; color: white; border: none; border-radius: 6px; padding: 8px 18px; cursor: pointer; font-size: 14px; } |
| 14 | + button:hover { background: #1d4ed8; } |
| 15 | + .output { margin-top: 1rem; } |
| 16 | + table { border-collapse: collapse; width: 100%; margin-top: 0.5rem; } |
| 17 | + th, td { border: 1px solid #d1d5db; padding: 6px 12px; text-align: left; font-size: 13px; } |
| 18 | + th { background: #f1f5f9; font-weight: 600; } |
| 19 | + .example-list li { margin-bottom: 0.4rem; } |
| 20 | + </style> |
| 21 | +</head> |
| 22 | +<body> |
| 23 | + <h1>🐼 tsb – <code>readHtml()</code></h1> |
| 24 | + <p> |
| 25 | + <code>readHtml(html, opts?)</code> mirrors <a href="https://pandas.pydata.org/docs/reference/api/pandas.read_html.html" target="_blank"><code>pandas.read_html()</code></a>. |
| 26 | + It scans an HTML string for <code>&lt;table&gt;</code> elements and returns one <strong>DataFrame</strong> per table found. |
| 27 | + </p> |
| 28 | + |
| 29 | + <h2>Live Demo</h2> |
| 30 | + <p>Paste or edit HTML below, then click <strong>Parse</strong>.</p> |
| 31 | + <textarea id="html-input" rows="14"> |
| 32 | +<table> |
| 33 | + <thead> |
| 34 | + <tr><th>Country</th><th>Population (M)</th><th>GDP (B USD)</th></tr> |
| 35 | + </thead> |
| 36 | + <tbody> |
| 37 | + <tr><td>USA</td><td>331</td><td>23000</td></tr> |
| 38 | + <tr><td>China</td><td>1411</td><td>17700</td></tr> |
| 39 | + <tr><td>Germany</td><td>84</td><td>4500</td></tr> |
| 40 | + </tbody> |
| 41 | +</table> |
| 42 | + </textarea> |
| 43 | + <br /><br /> |
| 44 | + <label>header row index: <input id="header-input" type="number" value="0" style="width:60px" /></label> |
| 45 | + |
| 46 | + <label>converters (numeric): <input id="converters-input" type="checkbox" checked /></label> |
| 47 | + |
| 48 | + <label>nrows: <input id="nrows-input" type="number" placeholder="all" style="width:60px" /></label> |
| 49 | + <br /><br /> |
| 50 | + <button onclick="run()">▶ Parse</button> |
| 51 | + |
| 52 | + <div id="output" class="output"></div> |
| 53 | + |
| 54 | + <h2>Code Example</h2> |
| 55 | + <pre><code>import { readHtml } from "tsb"; |
| 56 | + |
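| 57 | +const html = `&lt;table&gt; |
| 58 | + &lt;thead&gt;&lt;tr&gt;&lt;th&gt;Name&lt;/th&gt;&lt;th&gt;Age&lt;/th&gt;&lt;/tr&gt;&lt;/thead&gt; |
| 59 | + &lt;tbody&gt; |
| 60 | + &lt;tr&gt;&lt;td&gt;Alice&lt;/td&gt;&lt;td&gt;30&lt;/td&gt;&lt;/tr&gt; |
| 61 | + &lt;tr&gt;&lt;td&gt;Bob&lt;/td&gt;&lt;td&gt;25&lt;/td&gt;&lt;/tr&gt; |
| 62 | + &lt;/tbody&gt; |
| 63 | +&lt;/table&gt;`; |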
| 64 | + |
| 65 | +const [df] = readHtml(html); |
| 66 | +console.log(df.columns); // ["Name", "Age"] |
| 67 | +console.log(df.shape); // [2, 2] |
| 68 | +console.log(df.toRecords()); |
| 69 | +// [{ Name: "Alice", Age: 30 }, { Name: "Bob", Age: 25 }]</code></pre> |
| 70 | + |
| 71 | + <h2>Supported Options</h2> |
| 72 | + <ul class="example-list"> |
| 73 | + <li><code>header</code> — which row to use as column names (default <code>0</code>). Use <code>null</code> for no header.</li> |
| 74 | + <li><code>indexCol</code> — column name or index to use as the row index.</li> |
| 75 | + <li><code>match</code> — array of table indices to return (e.g. <code>[0, 2]</code>).</li> |
| 76 | + <li><code>naValues</code> — extra strings to treat as NaN (default includes <code>""</code>, <code>"NA"</code>, <code>"NaN"</code>, <code>"None"</code>).</li> |
| 77 | + <li><code>converters</code> — try to convert cells to numbers (default <code>true</code>).</li> |
| 78 | + <li><code>thousands</code> — thousands-separator character, e.g. <code>","</code>.</li> |
| 79 | + <li><code>decimal</code> — decimal separator, default <code>"."</code>.</li> |
| 80 | + <li><code>skipRows</code> — 0-based row indices to skip in the body.</li> |
| 81 | + <li><code>nrows</code> — maximum rows to return.</li> |
| 82 | + <li><code>skipBlankLines</code> — skip rows where all cells are whitespace (default <code>true</code>).</li> |
| 83 | + </ul> |
| 84 | + |
| 85 | + <script type="module"> |
| 86 | + // In a real app: import { readHtml } from "tsb" |
| 87 | + // Here we inline the core logic for the demo. |
| 88 | + |
| 89 | + // ── mini HTML parser (same as src/io/read_html.ts) ────────────────────── |
/**
 * Convert an HTML fragment to plain text: remove every tag, then decode
 * character references.
 *
 * Decoding is done in a single pass so that text produced by one reference is
 * never re-read as the start of another (`&amp;lt;` yields the literal `&lt;`,
 * not `<`).
 *
 * @param {string} html - HTML fragment, e.g. the inner HTML of a table cell.
 * @returns {string} The fragment's text with tags removed and the supported
 *   references (`&amp;`, `&lt;`, `&gt;`, `&nbsp;`, `&quot;`, decimal and hex
 *   numeric references) decoded. Unknown or out-of-range references are kept
 *   verbatim.
 */
function stripTags(html) {
  // A Map (not an object literal) so names like "constructor" cannot hit the prototype.
  const named = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["nbsp", " "], // mapped to a plain space so callers' trim() removes it
    ["quot", '"'],
  ]);
  return html
    .replace(/<[^>]*>/g, "")
    .replace(/&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi, (ref, dec, hex, name) => {
      if (name !== undefined) return named.get(name.toLowerCase()) ?? ref;
      const code = dec !== undefined ? Number(dec) : Number.parseInt(hex, 16);
      // fromCodePoint handles characters above U+FFFF; it throws past U+10FFFF, so guard.
      return code <= 0x10ffff ? String.fromCodePoint(code) : ref;
    });
}
/**
 * Collect the inner HTML of every `<tag …>…</tag>` element found in `html`.
 *
 * Matching is case-insensitive and not nesting-aware: each opening tag is
 * paired with the first closing tag that follows it. An opening tag with no
 * closing tag after it contributes nothing.
 *
 * @param {string} html - Markup to scan.
 * @param {string} tag - Element name, interpolated into a RegExp unescaped, so
 *   it must be a plain name such as "tr" or "tbody".
 * @returns {string[]} Inner HTML of each matched element, in document order.
 */
function extractBlocks(html, tag) {
  const open = new RegExp(`<${tag}(?:\\s[^>]*)?>`, "gi");
  // End tags may carry whitespace before ">" (`</tr >` is valid HTML).
  const close = new RegExp(`</${tag}\\s*>`, "gi");
  const results = [];
  for (const opening of html.matchAll(open)) {
    const start = opening.index + opening[0].length;
    close.lastIndex = start; // search for the end tag only after this opening tag
    const end = close.exec(html);
    if (end) results.push(html.slice(start, end.index));
  }
  return results;
}
/**
 * Return the outer HTML of every top-level `<table>` element in `html`.
 *
 * Nesting is tracked with a depth counter, so a table nested inside another
 * table's cell is returned as part of the outer table rather than on its own.
 * A closing tag with no matching opening tag is ignored. A table that is never
 * closed is not returned.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} One string per top-level table, from `<table` through the
 *   matching `</table>`, in document order.
 */
function extractTables(html) {
  const tableTag = /<\/?table(?:\s[^>]*)?>/gi;
  const tables = [];
  let depth = 0;
  let start = -1;
  for (const tagMatch of html.matchAll(tableTag)) {
    const isClosing = tagMatch[0].startsWith("</");
    if (!isClosing) {
      if (depth === 0) start = tagMatch.index;
      depth += 1;
    } else if (depth > 0) {
      // Guarding on depth > 0 keeps a stray </table> from driving the counter
      // negative, which would hide every table that follows it.
      depth -= 1;
      if (depth === 0) {
        tables.push(html.slice(start, tagMatch.index + tagMatch[0].length));
        start = -1;
      }
    }
  }
  return tables;
}
/**
 * Parse the `<tr>` rows of a table section into arrays of cell text.
 *
 * Each `<td>`/`<th>` cell is reduced to plain text via `stripTags` and trimmed.
 * `colspan`/`rowspan` are not expanded: one cell element yields one value.
 *
 * @param {string} sectionHtml - Inner HTML of a `<thead>`, `<tbody>`, `<tfoot>`,
 *   or a whole table.
 * @returns {string[][]} One array of cell strings per row, in document order.
 */
function parseRows(sectionHtml) {
  // End tags may carry whitespace before ">" (`</td >` is valid HTML).
  const cellPattern = /<t[dh](?:\s[^>]*)?>([\s\S]*?)<\/t[dh]\s*>/gi;
  return extractBlocks(sectionHtml, "tr").map((rowHtml) =>
    // matchAll works on a private copy of the regex, so sharing cellPattern across rows is safe.
    Array.from(rowHtml.matchAll(cellPattern), (cell) => stripTags(cell[1]).trim()),
  );
}
/**
 * Flatten one table's markup into a list of rows.
 *
 * Rows are emitted in the order: all `<thead>` rows, then body rows, then all
 * `<tfoot>` rows. Body rows come from the `<tbody>` elements when at least one
 * exists; otherwise from whatever rows remain once `<thead>` and `<tfoot>`
 * sections are removed from the markup.
 *
 * @param {string} tableHtml - Markup of a single table.
 * @returns {string[][]} Cell text for every row, header rows first.
 */
function parseTableHtml(tableHtml) {
  const rowsOf = (sections) => sections.flatMap((section) => parseRows(section));

  const headRows = rowsOf(extractBlocks(tableHtml, "thead"));

  const bodySections = extractBlocks(tableHtml, "tbody");
  let bodyRows;
  if (bodySections.length) {
    bodyRows = rowsOf(bodySections);
  } else {
    // No explicit <tbody>: parse the table itself minus its head and foot sections.
    const withoutHeadAndFoot = tableHtml
      .replace(/<thead[\s\S]*?<\/thead>/gi, "")
      .replace(/<tfoot[\s\S]*?<\/tfoot>/gi, "");
    bodyRows = parseRows(withoutHeadAndFoot);
  }

  const footRows = rowsOf(extractBlocks(tableHtml, "tfoot"));

  return [...headRows, ...bodyRows, ...footRows];
}
// Cell strings treated as missing values when the caller supplies no list of their own.
const DEFAULT_NA = new Set(["", "NA", "NaN", "N/A", "null", "None", "nan"]);

/**
 * Turn one raw cell string into its typed value.
 *
 * @param {string} raw - Cell text.
 * @param {Set<string>} na - Strings that mean "missing"; a match yields `null`.
 * @param {boolean} num - When false, non-missing cells are returned unchanged.
 * @param {string|null} thousands - Grouping separator to remove before parsing, if any.
 * @param {string} decimal - Decimal separator; its first occurrence is replaced by ".".
 * @returns {number|string|null} `null` for missing values, a number when the
 *   cleaned text parses as one, otherwise the original `raw` string.
 */
function coerce(raw, na, num, thousands, decimal) {
  if (na.has(raw)) return null;
  if (!num) return raw;

  let text = raw;
  if (thousands) text = text.split(thousands).join("");
  if (decimal !== ".") text = text.replace(decimal, ".");

  // Number("") and Number("  ") are 0, so blank text must be rejected explicitly.
  if (text.trim() === "") return raw;

  const parsed = Number(text);
  return Number.isNaN(parsed) ? raw : parsed;
}
/**
 * Parse every `<table>` in an HTML string into a column-oriented table object.
 *
 * @param {string} html - Markup to scan for tables.
 * @param {object} [opts]
 * @param {number|null} [opts.header=0] - Index of the row holding column names;
 *   `null` means no header row, and columns are named "0", "1", ….
 * @param {string|number|null} [opts.indexCol=null] - Passed through unchanged on
 *   each result; this function does not act on it.
 * @param {number[]} [opts.match] - If given, only tables at these indices are returned.
 * @param {Iterable<string>} [opts.naValues] - Extra strings treated as missing,
 *   in addition to the defaults in `DEFAULT_NA`.
 * @param {boolean} [opts.converters=true] - Convert numeric-looking cells to numbers.
 * @param {boolean} [opts.skipBlankLines=true] - Drop rows whose cells are all whitespace.
 * @param {number[]} [opts.skipRows] - 0-based indices of body rows to drop.
 * @param {number|null} [opts.nrows] - Maximum number of body rows to keep;
 *   `undefined` or `null` keeps all.
 * @param {string|null} [opts.thousands=null] - Thousands separator to strip before parsing numbers.
 * @param {string} [opts.decimal="."] - Decimal separator.
 * @returns {{columns: string[], data: Array<Array<number|string|null>>, indexCol: *}[]}
 *   One entry per selected table; `data[c][r]` is the value at column `c`, row `r`.
 */
function readHtml(html, opts = {}) {
  const {
    header = 0,
    indexCol = null,
    match: matchOpt,
    naValues,
    converters = true,
    skipBlankLines = true,
    skipRows,
    nrows,
    thousands = null,
    decimal = ".",
  } = opts;

  // naValues is documented as "extra" strings, so extend the defaults rather than replace them.
  const na = naValues ? new Set([...DEFAULT_NA, ...naValues]) : DEFAULT_NA;
  const skipped = skipRows?.length ? new Set(skipRows) : null;
  const result = [];

  extractTables(html).forEach((tableHtml, tableIndex) => {
    if (matchOpt && !matchOpt.includes(tableIndex)) return;

    const raw = parseTableHtml(tableHtml);
    let cols;
    let dataStart;
    if (header === null) {
      cols = Array.from({ length: raw[0]?.length ?? 0 }, (_, i) => String(i));
      dataStart = 0;
    } else {
      // Repeated header names get ".1", ".2", … suffixes so every column name is unique.
      const seen = new Map();
      cols = (raw[header] ?? []).map((name) => {
        const count = seen.get(name);
        if (count === undefined) {
          seen.set(name, 1);
          return name;
        }
        seen.set(name, count + 1);
        return `${name}.${count}`;
      });
      dataStart = header + 1;
    }

    let body = raw.slice(dataStart);
    if (skipped) body = body.filter((_, i) => !skipped.has(i));
    if (skipBlankLines) body = body.filter((row) => row.some((cell) => cell.trim() !== ""));
    // Treat null like undefined: slice(0, null) would silently return no rows.
    if (nrows !== undefined && nrows !== null) body = body.slice(0, nrows);

    // Transpose rows into columns; cells missing from short rows are coerced from "".
    const data = cols.map((_, ci) =>
      body.map((row) => coerce(row[ci] ?? "", na, converters, thousands, decimal)),
    );
    result.push({ columns: cols, data, indexCol });
  });

  return result;
}
| 175 | + |
| 176 | + // ── demo renderer ────────────────────────────────────────────────────────── |
/**
 * Demo entry point: read the form controls, parse the textarea's HTML with
 * `readHtml`, and render each resulting table (or the error) into `#output`.
 *
 * Assigned to `window` so the inline `onclick="run()"` handler can reach it from
 * outside this module script.
 */
window.run = function () {
  // Everything interpolated into innerHTML below is parsed data or error text,
  // so it must be escaped: a cell such as "a < b" would otherwise be read as markup.
  const escapeHtml = (value) =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");

  const output = document.getElementById("output");
  const htmlRaw = document.getElementById("html-input").value;
  const headerVal = document.getElementById("header-input").value;
  const convertersVal = document.getElementById("converters-input").checked;
  const nrowsVal = document.getElementById("nrows-input").value;

  const opts = {
    // An empty header field means "no header row".
    header: headerVal === "" ? null : Number(headerVal),
    converters: convertersVal,
  };
  if (nrowsVal !== "") opts.nrows = Number(nrowsVal);

  let dfs;
  try {
    // textarea.value is already the raw text; decoding it again would turn
    // escaped cell content such as "&lt;b&gt;" into real tags.
    dfs = readHtml(htmlRaw, opts);
  } catch (e) {
    output.innerHTML = `<pre style="color:red">${escapeHtml(e)}</pre>`;
    return;
  }

  if (dfs.length === 0) {
    output.innerHTML = "<p>No tables found.</p>";
    return;
  }

  const parts = [`<p>Found <strong>${dfs.length}</strong> table(s).</p>`];
  dfs.forEach((df, i) => {
    const rowCount = df.data[0]?.length ?? 0;
    parts.push(`<h3>Table ${i}</h3>`);
    parts.push(`<p>Shape: ${rowCount} rows × ${df.columns.length} cols</p>`);
    parts.push("<table><thead><tr>");
    for (const name of df.columns) parts.push(`<th>${escapeHtml(name)}</th>`);
    parts.push("</tr></thead><tbody>");
    for (let r = 0; r < rowCount; r++) {
      parts.push("<tr>");
      for (let c = 0; c < df.columns.length; c++) {
        const v = df.data[c]?.[r];
        parts.push(`<td>${v === null ? "<em>null</em>" : escapeHtml(v)}</td>`);
      }
      parts.push("</tr>");
    }
    parts.push("</tbody></table>");
  });
  output.innerHTML = parts.join("");
};

// Auto-run on load
window.run();
| 229 | + </script> |
| 230 | +</body> |
| 231 | +</html> |
0 commit comments