Skip to content

Commit 4c06d75

Browse files
Copilot and mrjf authored
Fix CI for read_html iteration
Co-authored-by: mrjf <180956+mrjf@users.noreply.github.com>
1 parent 0683e86 commit 4c06d75

4 files changed

Lines changed: 70 additions & 52 deletions

File tree

src/io/read_html.ts

Lines changed: 13 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -98,9 +98,7 @@ function stripTags(html: string): string {
9898
.replace(/&gt;/gi, ">")
9999
.replace(/&nbsp;/gi, " ")
100100
.replace(/&quot;/gi, '"')
101-
.replace(/&#(\d+);/g, (_, code: string) =>
102-
String.fromCharCode(Number(code)),
103-
)
101+
.replace(/&#(\d+);/g, (_, code: string) => String.fromCharCode(Number(code)))
104102
.replace(/&#x([0-9a-fA-F]+);/g, (_, hex: string) =>
105103
String.fromCharCode(Number.parseInt(hex, 16)),
106104
);
@@ -129,29 +127,26 @@ function extractBlocks(html: string, tag: string): string[] {
129127

130128
/** Extract all `<table>…</table>` blocks (nested tables are not unwrapped). */
131129
function extractTables(html: string): string[] {
132-
const tables: string[] = [];
133-
let depth = 0;
134-
let start = -1;
130+
const tables: { start: number; html: string }[] = [];
131+
const starts: number[] = [];
135132

136-
// Walk through all <table> and </table> tags to handle nesting.
137133
const allTags = /<\/?table(?:\s[^>]*)?>/gi;
138134
let m: RegExpExecArray | null;
139135
while ((m = allTags.exec(html)) !== null) {
140136
const tag = m[0].toLowerCase();
141-
if (!tag.startsWith("</")) {
142-
if (depth === 0) {
143-
start = m.index;
137+
if (tag.startsWith("</")) {
138+
const start = starts.pop();
139+
if (start !== undefined) {
140+
tables.push({
141+
start,
142+
html: html.slice(start, m.index + m[0].length),
143+
});
144144
}
145-
depth++;
146145
} else {
147-
depth--;
148-
if (depth === 0 && start >= 0) {
149-
tables.push(html.slice(start, m.index + m[0].length));
150-
start = -1;
151-
}
146+
starts.push(m.index);
152147
}
153148
}
154-
return tables;
149+
return tables.sort((a, b) => a.start - b.start).map((t) => t.html);
155150
}
156151

157152
/** Parse `<tr>` blocks out of a table section (`<thead>` or `<tbody>`). */
@@ -334,9 +329,7 @@ export function readHtml(html: string, opts: ReadHtmlOptions = {}): DataFrame[]
334329
for (const row of bodyRows) {
335330
for (let ci = 0; ci < ncols; ci++) {
336331
const raw = row[ci] ?? "";
337-
colArrays[ci]!.push(
338-
coerceValue(raw, naSet, converters, thousands, decimal),
339-
);
332+
colArrays[ci]!.push(coerceValue(raw, naSet, converters, thousands, decimal));
340333
}
341334
}
342335

tests-e2e/playground-cells.test.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ const NON_PLAYGROUND_PAGES = new Set<string>([
5757
"examples.html",
5858
"extensions.html",
5959
"format_table.html",
60+
"read_html.html",
6061
]);
6162

6263
const PORT = 3399;

tests/io/read_html.test.ts

Lines changed: 54 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -10,21 +10,25 @@ import { readHtml } from "../../src/index.ts";
1010

1111
function simpleTable(headers: string[], rows: string[][]): string {
1212
const thRow = headers.map((h) => `<th>${h}</th>`).join("");
13-
const trRows = rows
14-
.map((r) => `<tr>${r.map((c) => `<td>${c}</td>`).join("")}</tr>`)
15-
.join("\n");
13+
const trRows = rows.map((r) => `<tr>${r.map((c) => `<td>${c}</td>`).join("")}</tr>`).join("\n");
1614
return `<table><thead><tr>${thRow}</tr></thead><tbody>${trRows}</tbody></table>`;
1715
}
1816

1917
// ─── basic parsing ────────────────────────────────────────────────────────────
2018

2119
describe("readHtml – basic", () => {
2220
test("parses single table", () => {
23-
const html = simpleTable(["a", "b"], [["1", "2"], ["3", "4"]]);
21+
const html = simpleTable(
22+
["a", "b"],
23+
[
24+
["1", "2"],
25+
["3", "4"],
26+
],
27+
);
2428
const dfs = readHtml(html);
2529
expect(dfs.length).toBe(1);
2630
const df = dfs[0]!;
27-
expect(df.columns).toEqual(["a", "b"]);
31+
expect(df.columns.toArray()).toEqual(["a", "b"]);
2832
expect(df.shape).toEqual([2, 2]);
2933
});
3034

@@ -33,8 +37,8 @@ describe("readHtml – basic", () => {
3337
const t2 = simpleTable(["y"], [["20"]]);
3438
const dfs = readHtml(t1 + t2);
3539
expect(dfs.length).toBe(2);
36-
expect(dfs[0]!.columns).toEqual(["x"]);
37-
expect(dfs[1]!.columns).toEqual(["y"]);
40+
expect(dfs[0]!.columns.toArray()).toEqual(["x"]);
41+
expect(dfs[1]!.columns.toArray()).toEqual(["y"]);
3842
});
3943

4044
test("returns empty array when no tables found", () => {
@@ -57,9 +61,9 @@ describe("readHtml – basic", () => {
5761
});
5862

5963
test("header=null uses integer column names", () => {
60-
const html = `<table><tr><td>a</td><td>b</td></tr><tr><td>1</td><td>2</td></tr></table>`;
64+
const html = "<table><tr><td>a</td><td>b</td></tr><tr><td>1</td><td>2</td></tr></table>";
6165
const [df] = readHtml(html, { header: null });
62-
expect(df!.columns).toEqual(["0", "1"]);
66+
expect(df!.columns.toArray()).toEqual(["0", "1"]);
6367
expect(df!.shape[0]).toBe(2);
6468
});
6569
});
@@ -73,7 +77,7 @@ describe("readHtml – header", () => {
7377
<tr><td>Alice</td><td>30</td></tr>
7478
</table>`;
7579
const [df] = readHtml(html, { header: 0 });
76-
expect(df!.columns).toEqual(["Name", "Age"]);
80+
expect(df!.columns.toArray()).toEqual(["Name", "Age"]);
7781
expect(df!.shape[0]).toBe(1);
7882
});
7983

@@ -83,9 +87,10 @@ describe("readHtml – header", () => {
8387
<tr><td>1</td><td>2</td><td>3</td></tr>
8488
</table>`;
8589
const [df] = readHtml(html);
86-
expect(df!.columns[0]).toBe("x");
87-
expect(df!.columns[1]).toBe("x.1");
88-
expect(df!.columns[2]).toBe("y");
90+
const cols = df!.columns.toArray();
91+
expect(cols[0]).toBe("x");
92+
expect(cols[1]).toBe("x.1");
93+
expect(cols[2]).toBe("y");
8994
});
9095
});
9196

@@ -94,7 +99,7 @@ describe("readHtml – header", () => {
9499
describe("readHtml – NA values", () => {
95100
test("empty string becomes null", () => {
96101
const html = simpleTable(["v"], [[""], ["1"]]);
97-
const [df] = readHtml(html);
102+
const [df] = readHtml(html, { skipBlankLines: false });
98103
expect(df!.col("v").toArray()[0]).toBeNull();
99104
expect(df!.col("v").toArray()[1]).toBe(1);
100105
});
@@ -133,7 +138,7 @@ describe("readHtml – converters", () => {
133138
test("decimal separator", () => {
134139
const html = simpleTable(["n"], [["3,14"]]);
135140
const [df] = readHtml(html, { decimal: "," });
136-
expect((df!.col("n").toArray()[0] as number)).toBeCloseTo(3.14);
141+
expect(df!.col("n").toArray()[0] as number).toBeCloseTo(3.14);
137142
});
138143
});
139144

@@ -146,7 +151,7 @@ describe("readHtml – filtering", () => {
146151
const t2 = simpleTable(["c"], [["3"]]);
147152
const dfs = readHtml(t0 + t1 + t2, { match: [1] });
148153
expect(dfs.length).toBe(1);
149-
expect(dfs[0]!.columns).toEqual(["b"]);
154+
expect(dfs[0]!.columns.toArray()).toEqual(["b"]);
150155
});
151156

152157
test("skipRows", () => {
@@ -181,18 +186,30 @@ describe("readHtml – filtering", () => {
181186

182187
describe("readHtml – indexCol", () => {
183188
test("sets named column as index", () => {
184-
const html = simpleTable(["id", "val"], [["a", "1"], ["b", "2"]]);
189+
const html = simpleTable(
190+
["id", "val"],
191+
[
192+
["a", "1"],
193+
["b", "2"],
194+
],
195+
);
185196
const [df] = readHtml(html, { indexCol: "id" });
186197
// "id" column removed from columns
187-
expect(df!.columns).toEqual(["val"]);
198+
expect(df!.columns.toArray()).toEqual(["val"]);
188199
// index contains "a", "b"
189200
expect(df!.index.toArray()).toEqual(["a", "b"]);
190201
});
191202

192203
test("sets column by integer position as index", () => {
193-
const html = simpleTable(["id", "val"], [["x", "10"], ["y", "20"]]);
204+
const html = simpleTable(
205+
["id", "val"],
206+
[
207+
["x", "10"],
208+
["y", "20"],
209+
],
210+
);
194211
const [df] = readHtml(html, { indexCol: 0 });
195-
expect(df!.columns).toEqual(["val"]);
212+
expect(df!.columns.toArray()).toEqual(["val"]);
196213
});
197214
});
198215

@@ -235,7 +252,7 @@ describe("readHtml – structure variants", () => {
235252
<tr><td>1</td><td>2</td></tr>
236253
</table>`;
237254
const [df] = readHtml(html);
238-
expect(df!.columns).toEqual(["x", "y"]);
255+
expect(df!.columns.toArray()).toEqual(["x", "y"]);
239256
expect(df!.shape[0]).toBe(1);
240257
});
241258

@@ -257,7 +274,7 @@ describe("readHtml – structure variants", () => {
257274
<tr><td id="c1">Alice</td></tr>
258275
</table>`;
259276
const [df] = readHtml(html, { converters: false });
260-
expect(df!.columns).toEqual(["Name"]);
277+
expect(df!.columns.toArray()).toEqual(["Name"]);
261278
expect(df!.col("Name").toArray()[0]).toBe("Alice");
262279
});
263280

@@ -276,19 +293,24 @@ describe("readHtml – property tests", () => {
276293
test("roundtrip: all numeric values survive parse", () => {
277294
fc.assert(
278295
fc.property(
279-
fc.array(fc.array(fc.integer({ min: -1000, max: 1000 }), { minLength: 1, maxLength: 5 }), {
280-
minLength: 1,
281-
maxLength: 10,
282-
}),
296+
fc.integer({ min: 1, max: 5 }).chain((ncols) =>
297+
fc.array(
298+
fc.array(fc.integer({ min: -1000, max: 1000 }), {
299+
minLength: ncols,
300+
maxLength: ncols,
301+
}),
302+
{ minLength: 1, maxLength: 10 },
303+
),
304+
),
283305
(rows) => {
284306
const ncols = rows[0]!.length;
285307
const headers = Array.from({ length: ncols }, (_, i) => `col${i}`);
286308
const strRows = rows.map((r) => r.map(String));
287309
const html = simpleTable(headers, strRows);
288310
const [df] = readHtml(html);
289311
const flatIn = rows.flat();
290-
const flatOut = headers.flatMap((h) =>
291-
(df?.col(h).toArray() ?? []).map(Number),
312+
const flatOut = (df?.toRecords() ?? []).flatMap((record) =>
313+
rows[0]!.map((_, ci) => Number(record[headers[ci]!])),
292314
);
293315
// same length
294316
if (flatIn.length !== flatOut.length) return false;
@@ -302,9 +324,9 @@ describe("readHtml – property tests", () => {
302324
test("number of returned DataFrames equals number of tables in HTML", () => {
303325
fc.assert(
304326
fc.property(fc.integer({ min: 0, max: 6 }), (n) => {
305-
const tables = Array.from({ length: n }, (_, i) =>
306-
simpleTable([`c${i}`], [["1"]]),
307-
).join(" ");
327+
const tables = Array.from({ length: n }, (_, i) => simpleTable([`c${i}`], [["1"]])).join(
328+
" ",
329+
);
308330
const dfs = readHtml(tables);
309331
return dfs.length === n;
310332
}),
@@ -332,7 +354,7 @@ describe("readHtml – realistic HTML", () => {
332354

333355
test("parses Wikipedia-style table from full HTML doc", () => {
334356
const [df] = readHtml(wikipedia);
335-
expect(df!.columns).toEqual(["Country", "Population (M)", "GDP (B USD)"]);
357+
expect(df!.columns.toArray()).toEqual(["Country", "Population (M)", "GDP (B USD)"]);
336358
expect(df!.shape).toEqual([3, 3]);
337359
});
338360

tests/playground.test.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@ const NON_PLAYGROUND_PAGES = new Set<string>([
3232
"examples.html",
3333
// The extensions page is a static reference page — no interactive code blocks.
3434
"extensions.html",
35+
// read_html is a standalone tutorial page, not a runtime-backed code-cell playground.
36+
"read_html.html",
3537
]);
3638

3739
const REAL_WORLD_EXAMPLE_PAGES = [

0 commit comments

Comments
 (0)