Merge pull request #227 from githubnext/copilot/autoloopport-str-get-dummies

mrjf · web-flow · commit f3a33d6a2ef4 · 2026-04-27T08:08:39.000-07:00
Extract `strGetDummies` into a dedicated module
diff --git a/playground/str_get_dummies.html b/playground/str_get_dummies.html
@@ -0,0 +1,109 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>tsb — str.get_dummies: multi-label string encoding</title>
+    <style>
+      * { box-sizing: border-box; margin: 0; padding: 0; }
+      body { font-family: system-ui, sans-serif; background: #0d1117; color: #c9d1d9; line-height: 1.6; padding: 2rem; }
+      h1 { color: #58a6ff; font-size: 1.8rem; margin-bottom: .5rem; }
+      h2 { color: #79c0ff; font-size: 1.2rem; margin: 2rem 0 .75rem; }
+      p  { color: #8b949e; margin-bottom: 1rem; max-width: 800px; }
+      code { background: #161b22; padding: .1rem .4rem; border-radius: 4px; font-family: monospace; font-size: .9em; color: #a5d6ff; }
+      .card { background: #161b22; border: 1px solid #30363d; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; max-width: 900px; }
+      textarea { width: 100%; background: #0d1117; border: 1px solid #30363d; border-radius: 6px; color: #c9d1d9; font-family: monospace; font-size: .85rem; padding: .75rem; resize: vertical; min-height: 140px; }
+      button { background: #238636; color: #fff; border: none; border-radius: 6px; padding: .5rem 1.25rem; cursor: pointer; font-size: .9rem; margin-top: .75rem; }
+      button:hover { background: #2ea043; }
+      pre { background: #0d1117; border: 1px solid #21262d; border-radius: 6px; padding: 0.75rem 1rem; overflow-x: auto; font-size: 0.85rem; white-space: pre-wrap; margin-top: 0.5rem; color: #7ee787; font-family: monospace; }
+      a { color: #58a6ff; }
+    </style>
+  </head>
+  <body>
+    <h1>str.get_dummies — multi-label string encoding</h1>
+    <p>
+      Port of <code>pandas.Series.str.get_dummies(sep)</code>.  Splits each
+      string by a separator (default <code>"|"</code>) and returns a
+      <code>DataFrame</code> of binary indicator columns — one per unique token,
+      sorted lexicographically.  <code>null</code> / <code>undefined</code> /
+      <code>NaN</code> values produce a row of all zeros.
+    </p>
+    <p><a href="./index.html">← back to index</a></p>
+
+    <div class="card">
+      <h2>Example 1 — basic split on <code>|</code></h2>
+      <textarea id="ex1-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["a|b", "b|c", "a"], name: "tags" });
+const df = strGetDummies(s);
+console.log(JSON.stringify(df.toRecords(), null, 2));
+console.log("columns =", df.columns.values.join(", "));
+</textarea>
+      <button onclick="run('ex1')">Run</button>
+      <pre id="ex1-out">(click Run)</pre>
+    </div>
+
+    <div class="card">
+      <h2>Example 2 — custom separator</h2>
+      <textarea id="ex2-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["red,green", "green,blue", "red"] });
+const df = strGetDummies(s, { sep: "," });
+console.log(JSON.stringify(df.toRecords(), null, 2));
+</textarea>
+      <button onclick="run('ex2')">Run</button>
+      <pre id="ex2-out">(click Run)</pre>
+    </div>
+
+    <div class="card">
+      <h2>Example 3 — null / undefined / NaN → all-zero rows</h2>
+      <textarea id="ex3-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["a|b", null, undefined, NaN, "b"] });
+const df = strGetDummies(s);
+console.log(JSON.stringify(df.toRecords(), null, 2));
+</textarea>
+      <button onclick="run('ex3')">Run</button>
+      <pre id="ex3-out">(click Run)</pre>
+    </div>
+
+    <div class="card">
+      <h2>Example 4 — preserved Series index</h2>
+      <textarea id="ex4-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["python|pandas", "python|numpy", "pandas|numpy|scipy"], index: ["row-1", "row-2", "row-3"] });
+const df = strGetDummies(s);
+console.log("index =", df.index.values.join(", "));
+console.log(JSON.stringify(df.toRecords(), null, 2));
+</textarea>
+      <button onclick="run('ex4')">Run</button>
+      <pre id="ex4-out">(click Run)</pre>
+    </div>
+
+    <script type="module">
+      let tsb;
+      try {
+        tsb = await import("../src/index.ts"); // prefer the local sources
+      } catch {
+        tsb = await import("https://esm.sh/tsb@latest"); // otherwise fall back to esm.sh
+      }
+      window.tsb = tsb;
+
+      window.run = function run(id) { // runs the snippet in #<id>-code, prints into #<id>-out
+        const code = document.getElementById(`${id}-code`).value;
+        const out = document.getElementById(`${id}-out`);
+        const logs = [];
+        const origLog = console.log;
+        console.log = (...args) => logs.push(args.map(String).join(" ")); // capture, don't print
+        try {
+          new Function("tsb", code)(tsb);
+          out.textContent = logs.join("\n") || "(no output)";
+        } catch (e) {
+          out.textContent = "Error: " + (e?.message ?? String(e)); // snippets may throw non-Errors
+        } finally {
+          console.log = origLog; // always restore the real console.log
+        }
+      };
+    </script>
+  </body>
+</html>
diff --git a/src/index.ts b/src/index.ts
@@ -386,7 +386,6 @@ export {
 } from "./core/index.ts";
 export {
   strNormalize,
-  strGetDummies,
   strExtractAll,
   strRemovePrefix,
   strRemoveSuffix,
@@ -401,6 +400,8 @@ export {
   strIndent,
   strDedent,
 } from "./stats/index.ts";
+export { strGetDummies } from "./stats/index.ts";
+export type { StrGetDummiesOptions } from "./stats/index.ts";
 export type {
   NormalizeForm,
   StrInput,
diff --git a/src/stats/index.ts b/src/stats/index.ts
@@ -203,7 +203,6 @@ export { fillna, countna, countValid } from "./notna_isna.ts";
 export type { IsnaInput, FillnaOptions, DropnaOptions } from "./notna_isna.ts";
 export {
   strNormalize,
-  strGetDummies,
   strExtractAll,
   strRemovePrefix,
   strRemoveSuffix,
@@ -212,6 +211,8 @@ export {
   strByteLength,
 } from "./string_ops.ts";
 export type { NormalizeForm, StrInput, ExtractAllOptions } from "./string_ops.ts";
+export { strGetDummies } from "./str_get_dummies.ts";
+export type { StrGetDummiesOptions } from "./str_get_dummies.ts";
 export {
   strSplitExpand,
   strExtractGroups,
diff --git a/src/stats/str_get_dummies.ts b/src/stats/str_get_dummies.ts
@@ -0,0 +1,129 @@
+/**
+ * str_get_dummies — split string Series by separator and return a DataFrame of
+ * binary dummy/indicator variables.
+ *
+ * Mirrors `pandas.Series.str.get_dummies(sep='|')`.
+ *
+ * Each element is split by `sep`; the unique tokens across all elements become
+ * columns.  A cell is **1** if the token appeared in that row, **0** otherwise.
+ * Missing values (`null` / `undefined` / `NaN`) contribute no tokens and
+ * produce a row of all zeros.  Columns are sorted lexicographically and the
+ * original Series index is preserved on the returned DataFrame.
+ *
+ * @example
+ * ```ts
+ * import { Series, strGetDummies } from "tsb";
+ *
+ * const s = new Series({ data: ["a|b", "b|c", "a"], name: "flags" });
+ * const df = strGetDummies(s);
+ * // DataFrame:
+ * //    a  b  c
+ * // 0  1  1  0
+ * // 1  0  1  1
+ * // 2  1  0  0
+ * ```
+ *
+ * @module
+ */
+
+import { DataFrame, Series } from "../core/index.ts";
+import type { Scalar } from "../types.ts";
+
+// ─── Options ─────────────────────────────────────────────────────────────────
+
+/** Options for {@link strGetDummies}. */
+export interface StrGetDummiesOptions {
+  /**
+   * Separator string used to split each element into tokens.
+   * @default "|"
+   */
+  readonly sep?: string;
+
+  /**
+   * Optional prefix prepended to every column name; `""` disables prefixing.
+   * @default ""
+   */
+  readonly prefix?: string;
+
+  /**
+   * Separator between the prefix and the token name; unused when `prefix` is `""`.
+   * @default "_"
+   */
+  readonly prefixSep?: string;
+}
+
+// ─── Implementation ───────────────────────────────────────────────────────────
+
+/**
+ * Split each string in `series` by `sep` and return a DataFrame of binary
+ * dummy/indicator variables — one column per unique non-empty token.
+ *
+ * Mirrors `pandas.Series.str.get_dummies(sep)`, which drops empty tokens.
+ *
+ * @param series  Series of strings (or null/undefined/NaN), or a string array.
+ * @param options `sep` (default `"|"`) plus `prefix` / `prefixSep` for names.
+ * @returns       A DataFrame with the same index as `series` and integer
+ *                (`0`/`1`) columns — one per unique token, sorted
+ *                lexicographically.
+ *
+ * @example
+ * ```ts
+ * import { Series, strGetDummies } from "tsb";
+ *
+ * const s = new Series({ data: ["a|b", "b|c", null], name: "tags" });
+ * const df = strGetDummies(s, { sep: "|" });
+ * //    a  b  c
+ * // 0  1  1  0
+ * // 1  0  1  1
+ * // 2  0  0  0
+ * ```
+ */
+export function strGetDummies(
+  series: Series<Scalar> | readonly string[],
+  options: StrGetDummiesOptions = {},
+): DataFrame {
+  const sep = options.sep ?? "|";
+  const prefix = options.prefix ?? "";
+  const prefixSep = options.prefixSep ?? "_";
+  const colName = (token: string): string =>
+    prefix === "" ? token : `${prefix}${prefixSep}${token}`;
+  // Plain arrays are wrapped so they gain `values` and a default index.
+  const source = series instanceof Series ? series : new Series<Scalar>({ data: [...series] });
+  const vals = source.values;
+  const n = vals.length;
+
+  // Collect unique tokens and per-row token sets, skipping empty tokens like pandas.
+  const tokenSet = new Set<string>();
+  const rowTokens: Set<string>[] = new Array<Set<string>>(n);
+  for (let i = 0; i < n; i++) {
+    const v = vals[i];
+    const tokens = new Set<string>();
+    if (v !== null && v !== undefined && !(typeof v === "number" && Number.isNaN(v))) {
+      const str = typeof v === "string" ? v : String(v);
+      for (const tok of str.split(sep)) {
+        if (tok !== "") {
+          tokens.add(tok);
+          tokenSet.add(tok);
+        }
+      }
+    }
+    rowTokens[i] = tokens;
+  }
+
+  // Sort tokens lexicographically (pandas sorts columns for get_dummies).
+  const columns = [...tokenSet].sort();
+
+  // Build one Series per column.  A Map (not a plain object) keeps this order
+  // even for integer-like token names, which object keys would re-order.
+  const idx = source.index;
+  const colMap = new Map<string, Series<Scalar>>();
+  for (const col of columns) {
+    const arr: Scalar[] = new Array<Scalar>(n);
+    for (let i = 0; i < n; i++) {
+      arr[i] = rowTokens[i]?.has(col) === true ? 1 : 0;
+    }
+    colMap.set(colName(col), new Series<Scalar>({ data: arr, index: idx }));
+  }
+
+  return new DataFrame(colMap, idx);
+}
diff --git a/src/stats/string_ops.ts b/src/stats/string_ops.ts
@@ -10,7 +10,6 @@
  * the accessor or better expressed as pure standalone utilities:
  *
  * - `strNormalize`  — Unicode normalization (NFC / NFD / NFKC / NFKD)
- * - `strGetDummies` — split strings by delimiter → one-hot DataFrame
  * - `strExtractAll` — extract ALL regex matches per element
  * - `strRemovePrefix` — remove a leading prefix
  * - `strRemoveSuffix` — remove a trailing suffix
@@ -21,7 +20,7 @@
  * @module
  */
 
-import { DataFrame, Series } from "../core/index.ts";
+import { Series } from "../core/index.ts";
 import type { Scalar } from "../types.ts";
 
 // ─── public types ─────────────────────────────────────────────────────────────
@@ -32,27 +31,6 @@ export type NormalizeForm = "NFC" | "NFD" | "NFKC" | "NFKD";
 /** Input accepted by all string-op functions. */
 export type StrInput = Series<Scalar> | readonly Scalar[] | readonly string[] | string;
 
-/** Options for {@link strGetDummies}. */
-export interface StrGetDummiesOptions {
-  /**
-   * The delimiter used to split each element into tokens.
-   * @default "|"
-   */
-  readonly sep?: string;
-
-  /**
-   * Prefix prepended to every column name in the output DataFrame.
-   * @default ""
-   */
-  readonly prefix?: string;
-
-  /**
-   * Separator between the prefix and the token name.
-   * @default "_"
-   */
-  readonly prefixSep?: string;
-}
-
 /** Options for {@link strExtractAll}. */
 export interface ExtractAllOptions {
   /**
@@ -138,70 +116,6 @@ export function strNormalize(
   return buildSeries(data, input);
 }
 
-// ─── strGetDummies ────────────────────────────────────────────────────────────
-
-/**
- * Encode each string element as a row in a one-hot DataFrame by splitting on a
- * delimiter.
- *
- * Mirrors `pandas.Series.str.get_dummies(sep)`.
- *
- * @param input   - Series or string array.
- * @param options - `sep` (default `"|"`), `prefix` and `prefixSep` for column names.
- * @returns A `DataFrame` of 0/1 integer values, one column per unique token.
- *
- * @example
- * ```ts
- * const s = new Series({ data: ["a|b", "b|c", "a"] });
- * strGetDummies(s);
- * // DataFrame
- * //    a  b  c
- * // 0  1  1  0
- * // 1  0  1  1
- * // 2  1  0  0
- * ```
- */
-export function strGetDummies(
-  input: readonly string[] | Series<Scalar>,
-  options: StrGetDummiesOptions = {},
-): DataFrame {
-  const sep = options.sep ?? "|";
-  const prefix = options.prefix ?? "";
-  const prefixSep = options.prefixSep ?? "_";
-
-  const strs = toStringArray(input);
-
-  // 1. Collect all unique tokens in first-seen order.
-  const seen = new Set<string>();
-  const tokenRows: string[][] = strs.map((s) => {
-    const tokens = s === "" ? [] : s.split(sep);
-    for (const t of tokens) {
-      seen.add(t);
-    }
-    return tokens;
-  });
-
-  const allTokens = [...seen].sort(); // stable alphabetical order
-
-  // 2. Build column name with optional prefix.
-  const colName = (token: string): string =>
-    prefix === "" ? token : `${prefix}${prefixSep}${token}`;
-
-  // 3. Build one Scalar[] per column.
-  const columns: Record<string, Scalar[]> = {};
-  for (const token of allTokens) {
-    const name = colName(token);
-    columns[name] = tokenRows.map((row) => (row.includes(token) ? 1 : 0));
-  }
-
-  // 4. Preserve the row index from a Series input.
-  if (input instanceof Series) {
-    const rowIndex = input.index;
-    return DataFrame.fromColumns(columns, { index: rowIndex });
-  }
-  return DataFrame.fromColumns(columns);
-}
-
 // ─── strExtractAll ────────────────────────────────────────────────────────────
 
 /**
diff --git a/tests/stats/str_get_dummies.test.ts b/tests/stats/str_get_dummies.test.ts
diff --git a/tests/stats/string_ops.test.ts b/tests/stats/string_ops.test.ts