Skip to content

Commit f3a33d6

Browse files
authored
Merge pull request #227 from githubnext/copilot/autoloopport-str-get-dummies
Extract `strGetDummies` into a dedicated module
2 parents 37dc6ff + 917c7e6 commit f3a33d6

7 files changed

Lines changed: 542 additions & 153 deletions

File tree

playground/str_get_dummies.html

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
6+
<title>tsb — str.get_dummies: multi-label string encoding</title>
7+
<style>
8+
* { box-sizing: border-box; margin: 0; padding: 0; }
9+
body { font-family: system-ui, sans-serif; background: #0d1117; color: #c9d1d9; line-height: 1.6; padding: 2rem; }
10+
h1 { color: #58a6ff; font-size: 1.8rem; margin-bottom: .5rem; }
11+
h2 { color: #79c0ff; font-size: 1.2rem; margin: 2rem 0 .75rem; }
12+
p { color: #8b949e; margin-bottom: 1rem; max-width: 800px; }
13+
code { background: #161b22; padding: .1rem .4rem; border-radius: 4px; font-family: monospace; font-size: .9em; color: #a5d6ff; }
14+
.card { background: #161b22; border: 1px solid #30363d; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; max-width: 900px; }
15+
textarea { width: 100%; background: #0d1117; border: 1px solid #30363d; border-radius: 6px; color: #c9d1d9; font-family: monospace; font-size: .85rem; padding: .75rem; resize: vertical; min-height: 140px; }
16+
button { background: #238636; color: #fff; border: none; border-radius: 6px; padding: .5rem 1.25rem; cursor: pointer; font-size: .9rem; margin-top: .75rem; }
17+
button:hover { background: #2ea043; }
18+
pre { background: #0d1117; border: 1px solid #21262d; border-radius: 6px; padding: 0.75rem 1rem; overflow-x: auto; font-size: 0.85rem; white-space: pre-wrap; margin-top: 0.5rem; color: #7ee787; font-family: monospace; }
19+
a { color: #58a6ff; }
20+
</style>
21+
</head>
22+
<body>
23+
<h1>str.get_dummies — multi-label string encoding</h1>
24+
<p>
25+
Port of <code>pandas.Series.str.get_dummies(sep)</code>. Splits each
26+
string by a separator (default <code>"|"</code>) and returns a
27+
<code>DataFrame</code> of binary indicator columns — one per unique token,
28+
sorted lexicographically. <code>null</code> / <code>undefined</code> /
29+
<code>NaN</code> values produce a row of all zeros.
30+
</p>
31+
<p><a href="./index.html">← back to index</a></p>
32+
33+
<div class="card">
34+
<h2>Example 1 — basic split on <code>|</code></h2>
35+
<textarea id="ex1-code">
36+
const { Series, strGetDummies } = tsb;
37+
const s = new Series({ data: ["a|b", "b|c", "a"], name: "tags" });
38+
const df = strGetDummies(s);
39+
console.log(JSON.stringify(df.toRecords(), null, 2));
40+
console.log("columns =", df.columns.values.join(", "));
41+
</textarea>
42+
<button onclick="run('ex1')">Run</button>
43+
<pre id="ex1-out">(click Run)</pre>
44+
</div>
45+
46+
<div class="card">
47+
<h2>Example 2 — custom separator</h2>
48+
<textarea id="ex2-code">
49+
const { Series, strGetDummies } = tsb;
50+
const s = new Series({ data: ["red,green", "green,blue", "red"] });
51+
const df = strGetDummies(s, { sep: "," });
52+
console.log(JSON.stringify(df.toRecords(), null, 2));
53+
</textarea>
54+
<button onclick="run('ex2')">Run</button>
55+
<pre id="ex2-out">(click Run)</pre>
56+
</div>
57+
58+
<div class="card">
59+
<h2>Example 3 — null / undefined / NaN → all-zero rows</h2>
60+
<textarea id="ex3-code">
61+
const { Series, strGetDummies } = tsb;
62+
const s = new Series({ data: ["a|b", null, undefined, NaN, "b"] });
63+
const df = strGetDummies(s);
64+
console.log(JSON.stringify(df.toRecords(), null, 2));
65+
</textarea>
66+
<button onclick="run('ex3')">Run</button>
67+
<pre id="ex3-out">(click Run)</pre>
68+
</div>
69+
70+
<div class="card">
71+
<h2>Example 4 — preserved Series index</h2>
72+
<textarea id="ex4-code">
73+
const { Series, strGetDummies } = tsb;
74+
const s = new Series({ data: ["python|pandas", "python|numpy", "pandas|numpy|scipy"], index: ["row-1", "row-2", "row-3"] });
75+
const df = strGetDummies(s);
76+
console.log("index =", df.index.values.join(", "));
77+
console.log(JSON.stringify(df.toRecords(), null, 2));
78+
</textarea>
79+
<button onclick="run('ex4')">Run</button>
80+
<pre id="ex4-out">(click Run)</pre>
81+
</div>
82+
83+
<script type="module">
84+
// Load the library. The local source entry point is tried first; if that
// dynamic import rejects for any reason, the build published on esm.sh is
// used instead.
let tsb;
try {
  tsb = await import("../src/index.ts");
} catch {
  tsb = await import("https://esm.sh/tsb@latest");
}
// Expose the module globally so it can also be inspected from the dev console.
window.tsb = tsb;

/**
 * Run the snippet held in the `<id>-code` textarea and show whatever it logs
 * in the `<id>-out` element.
 *
 * The snippet is compiled with `new Function`, receiving the loaded library as
 * its single `tsb` argument. `console.log` is temporarily replaced so that
 * logged lines can be captured; it is always restored in `finally`.
 *
 * @param {string} id Example id prefix, e.g. "ex1".
 */
window.run = function run(id) {
  const code = document.getElementById(`${id}-code`).value;
  const out = document.getElementById(`${id}-out`);
  const logs = [];
  const origLog = console.log;
  console.log = (...args) => logs.push(args.map(String).join(" "));
  try {
    new Function("tsb", code)(tsb);
    out.textContent = logs.join("\n") || "(no output)";
  } catch (e) {
    // A snippet may throw anything (string, number, plain object); only Error
    // instances are guaranteed to carry `.message`.
    const message = e instanceof Error ? e.message : String(e);
    out.textContent = "Error: " + message;
  } finally {
    console.log = origLog;
  }
};
107+
</script>
108+
</body>
109+
</html>

src/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,6 @@ export {
386386
} from "./core/index.ts";
387387
export {
388388
strNormalize,
389-
strGetDummies,
390389
strExtractAll,
391390
strRemovePrefix,
392391
strRemoveSuffix,
@@ -401,6 +400,8 @@ export {
401400
strIndent,
402401
strDedent,
403402
} from "./stats/index.ts";
403+
export { strGetDummies } from "./stats/index.ts";
404+
export type { StrGetDummiesOptions } from "./stats/index.ts";
404405
export type {
405406
NormalizeForm,
406407
StrInput,

src/stats/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,6 @@ export { fillna, countna, countValid } from "./notna_isna.ts";
203203
export type { IsnaInput, FillnaOptions, DropnaOptions } from "./notna_isna.ts";
204204
export {
205205
strNormalize,
206-
strGetDummies,
207206
strExtractAll,
208207
strRemovePrefix,
209208
strRemoveSuffix,
@@ -212,6 +211,8 @@ export {
212211
strByteLength,
213212
} from "./string_ops.ts";
214213
export type { NormalizeForm, StrInput, ExtractAllOptions } from "./string_ops.ts";
214+
export { strGetDummies } from "./str_get_dummies.ts";
215+
export type { StrGetDummiesOptions } from "./str_get_dummies.ts";
215216
export {
216217
strSplitExpand,
217218
strExtractGroups,

src/stats/str_get_dummies.ts

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/**
2+
* str_get_dummies — split string Series by separator and return a DataFrame of
3+
* binary dummy/indicator variables.
4+
*
5+
* Mirrors `pandas.Series.str.get_dummies(sep='|')`.
6+
*
7+
* Each element is split by `sep`; the unique tokens across all elements become
8+
* columns. A cell is **1** if the token appeared in that row, **0** otherwise.
9+
* Missing values (`null` / `undefined` / `NaN`) contribute no tokens and
10+
* produce a row of all zeros. Columns are sorted lexicographically and the
11+
* original Series index is preserved on the returned DataFrame.
12+
*
13+
* @example
14+
* ```ts
15+
* import { Series, strGetDummies } from "tsb";
16+
*
17+
* const s = new Series({ data: ["a|b", "b|c", "a"], name: "flags" });
18+
* const df = strGetDummies(s);
19+
* // DataFrame:
20+
* // a b c
21+
* // 0 1 1 0
22+
* // 1 0 1 1
23+
* // 2 1 0 0
24+
* ```
25+
*
26+
* @module
27+
*/
28+
29+
import { DataFrame, Series } from "../core/index.ts";
30+
import type { Scalar } from "../types.ts";
31+
32+
// ─── Options ─────────────────────────────────────────────────────────────────
33+
34+
/** Options for {@link strGetDummies}: the split separator and optional column-name prefixing. */
35+
export interface StrGetDummiesOptions {
36+
/**
37+
* Separator string used to split each element into tokens.
38+
* @default "|"
39+
*/
40+
readonly sep?: string;
41+
42+
/**
43+
* Optional prefix prepended to every column name. When empty (the default), token names are used unchanged.
44+
* @default ""
45+
*/
46+
readonly prefix?: string;
47+
48+
/**
49+
* Separator placed between the prefix and the token name. Not applied when `prefix` is empty.
50+
* @default "_"
51+
*/
52+
readonly prefixSep?: string;
53+
}
54+
55+
// ─── Implementation ───────────────────────────────────────────────────────────
56+
57+
/**
 * Split each string in `series` by `sep` and return a DataFrame of binary
 * dummy/indicator variables — one column per unique, non-empty token.
 *
 * Mirrors `pandas.Series.str.get_dummies(sep)`:
 *
 * - Missing values (`null` / `undefined` / `NaN`) contribute no tokens and
 *   yield a row of all zeros.
 * - Empty tokens — produced by an empty string or by leading, trailing or
 *   doubled separators such as `"|a"`, `"a|"` or `"a||b"` — are discarded, so
 *   no column with an empty name is ever created.
 * - Other non-string values are converted with `String(v)` before splitting.
 *
 * @param series  A Series whose values are strings (or null/undefined/NaN).
 * @param options Separator (`sep`, default `"|"`) and optional column-name
 *                prefixing (`prefix`, default `""`; `prefixSep`, default
 *                `"_"`, used only when `prefix` is non-empty).
 * @returns       A DataFrame with the same index as `series` and integer
 *                (`0`/`1`) columns — one per unique token, ordered by the
 *                default `Array.prototype.sort` string comparison.
 *
 * @example
 * ```ts
 * import { Series, strGetDummies } from "tsb";
 *
 * const s = new Series({ data: ["a|b", "b||c", null], name: "tags" });
 * const df = strGetDummies(s, { sep: "|" });
 * //    a  b  c
 * // 0  1  1  0
 * // 1  0  1  1
 * // 2  0  0  0
 * ```
 */
export function strGetDummies(
  series: Series<Scalar>,
  options: StrGetDummiesOptions = {},
): DataFrame {
  const sep = options.sep ?? "|";
  const prefix = options.prefix ?? "";
  const prefixSep = options.prefixSep ?? "_";
  const colName = (token: string): string =>
    prefix === "" ? token : `${prefix}${prefixSep}${token}`;
  const vals = series.values;
  const n = vals.length;

  // Collect all unique tokens and per-row token sets.
  const tokenSet = new Set<string>();
  const rowTokens: Set<string>[] = [];

  for (let i = 0; i < n; i++) {
    const v = vals[i];
    const tokens = new Set<string>();
    const missing =
      v === null || v === undefined || (typeof v === "number" && Number.isNaN(v));
    if (!missing) {
      const str = typeof v === "string" ? v : String(v);
      for (const tok of str.split(sep)) {
        // pandas drops empty tags; this also covers the empty-string element,
        // because "".split(sep) yields [""] for any non-empty separator.
        if (tok === "") {
          continue;
        }
        tokens.add(tok);
        tokenSet.add(tok);
      }
    }
    rowTokens.push(tokens);
  }

  // Sort tokens lexicographically (pandas sorts columns for get_dummies).
  const columns = [...tokenSet].sort();

  // Build one Series per column. Use a Map (rather than a plain object)
  // so that lexicographic order is preserved even for integer-like token
  // names (plain object keys re-order numeric strings).
  const idx = series.index;
  const colMap = new Map<string, Series<Scalar>>();
  for (const col of columns) {
    const arr: Scalar[] = new Array<Scalar>(n);
    for (let i = 0; i < n; i++) {
      arr[i] = rowTokens[i]?.has(col) === true ? 1 : 0;
    }
    colMap.set(colName(col), new Series<Scalar>({ data: arr, index: idx }));
  }

  return new DataFrame(colMap, idx);
}

src/stats/string_ops.ts

Lines changed: 1 addition & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
* the accessor or better expressed as pure standalone utilities:
1111
*
1212
* - `strNormalize` — Unicode normalization (NFC / NFD / NFKC / NFKD)
13-
* - `strGetDummies` — split strings by delimiter → one-hot DataFrame
1413
* - `strExtractAll` — extract ALL regex matches per element
1514
* - `strRemovePrefix` — remove a leading prefix
1615
* - `strRemoveSuffix` — remove a trailing suffix
@@ -21,7 +20,7 @@
2120
* @module
2221
*/
2322

24-
import { DataFrame, Series } from "../core/index.ts";
23+
import { Series } from "../core/index.ts";
2524
import type { Scalar } from "../types.ts";
2625

2726
// ─── public types ─────────────────────────────────────────────────────────────
@@ -32,27 +31,6 @@ export type NormalizeForm = "NFC" | "NFD" | "NFKC" | "NFKD";
3231
/** Input accepted by all string-op functions. */
3332
export type StrInput = Series<Scalar> | readonly Scalar[] | readonly string[] | string;
3433

35-
/** Options for {@link strGetDummies}. */
36-
export interface StrGetDummiesOptions {
37-
/**
38-
* The delimiter used to split each element into tokens.
39-
* @default "|"
40-
*/
41-
readonly sep?: string;
42-
43-
/**
44-
* Prefix prepended to every column name in the output DataFrame.
45-
* @default ""
46-
*/
47-
readonly prefix?: string;
48-
49-
/**
50-
* Separator between the prefix and the token name.
51-
* @default "_"
52-
*/
53-
readonly prefixSep?: string;
54-
}
55-
5634
/** Options for {@link strExtractAll}. */
5735
export interface ExtractAllOptions {
5836
/**
@@ -138,70 +116,6 @@ export function strNormalize(
138116
return buildSeries(data, input);
139117
}
140118

141-
// ─── strGetDummies ────────────────────────────────────────────────────────────
142-
143-
/**
144-
* Encode each string element as a row in a one-hot DataFrame by splitting on a
145-
* delimiter.
146-
*
147-
* Mirrors `pandas.Series.str.get_dummies(sep)`.
148-
*
149-
* @param input - Series or string array.
150-
* @param options - `sep` (default `"|"`), `prefix` and `prefixSep` for column names.
151-
* @returns A `DataFrame` of 0/1 integer values, one column per unique token.
152-
*
153-
* @example
154-
* ```ts
155-
* const s = new Series({ data: ["a|b", "b|c", "a"] });
156-
* strGetDummies(s);
157-
* // DataFrame
158-
* // a b c
159-
* // 0 1 1 0
160-
* // 1 0 1 1
161-
* // 2 1 0 0
162-
* ```
163-
*/
164-
export function strGetDummies(
165-
input: readonly string[] | Series<Scalar>,
166-
options: StrGetDummiesOptions = {},
167-
): DataFrame {
168-
const sep = options.sep ?? "|";
169-
const prefix = options.prefix ?? "";
170-
const prefixSep = options.prefixSep ?? "_";
171-
172-
const strs = toStringArray(input);
173-
174-
// 1. Collect all unique tokens in first-seen order.
175-
const seen = new Set<string>();
176-
const tokenRows: string[][] = strs.map((s) => {
177-
const tokens = s === "" ? [] : s.split(sep);
178-
for (const t of tokens) {
179-
seen.add(t);
180-
}
181-
return tokens;
182-
});
183-
184-
const allTokens = [...seen].sort(); // stable alphabetical order
185-
186-
// 2. Build column name with optional prefix.
187-
const colName = (token: string): string =>
188-
prefix === "" ? token : `${prefix}${prefixSep}${token}`;
189-
190-
// 3. Build one Scalar[] per column.
191-
const columns: Record<string, Scalar[]> = {};
192-
for (const token of allTokens) {
193-
const name = colName(token);
194-
columns[name] = tokenRows.map((row) => (row.includes(token) ? 1 : 0));
195-
}
196-
197-
// 4. Preserve the row index from a Series input.
198-
if (input instanceof Series) {
199-
const rowIndex = input.index;
200-
return DataFrame.fromColumns(columns, { index: rowIndex });
201-
}
202-
return DataFrame.fromColumns(columns);
203-
}
204-
205119
// ─── strExtractAll ────────────────────────────────────────────────────────────
206120

207121
/**

0 commit comments

Comments
 (0)