perf: use lookup table in splitIntoPotentialTokens (#240)

xiaoxiaojx · claude · web-flow · commit 0872bcd7639b · 2026-06-26T16:43:12.000+03:00
Replace multi-comparison chains with a Uint8Array bitmask lookup per
character.  Phase 1 checked 4 conditions (cc !== 10/59/123/125) and
phase 2 checked 6 conditions (cc === 59/32/123/125/13/9) on every
character.  The lookup table reduces both to a single indexed read
plus one bitwise AND.

Benchmark results (ops/s, higher is better):

  splitIntoPotentialTokens fixture:   533 → 574  (+7.7%)
  splitIntoPotentialTokens big:       392 → 474  (+20.9%)
  original-source streamChunks():     904 → 983  (+8.7%)
  original-source streamChunks(big):  334 → 419  (+25.4%)
  realistic cold sourceAndMap():      864 → 901  (+4.3%)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.changeset/split-into-potential-tokens-lookup-table.md b/.changeset/split-into-potential-tokens-lookup-table.md
@@ -0,0 +1,7 @@
+---
+"webpack-sources": patch
+---
+
+perf: use lookup table in splitIntoPotentialTokens for faster character classification
+
+Replace multi-comparison chains (4 comparisons in phase 1, 6 in phase 2) with a single Uint8Array bitmask lookup per character. This reduces per-character branching overhead, yielding a ~8% improvement on typical sources and ~21% on large sources.
diff --git a/lib/helpers/splitIntoPotentialTokens.js b/lib/helpers/splitIntoPotentialTokens.js
@@ -5,13 +5,29 @@
 
 "use strict";
 
-// \n = 10
-// ; = 59
-// { = 123
-// } = 125
-// <space> = 32
-// \r = 13
-// \t = 9
+// Character classification via a lookup table.  A single bitmask test
+// replaces the multi-comparison chains in each inner loop phase.
+//
+// Bit layout per character:
+//   bit 0 (STOP1 = 1): stops phase-1 scan  (\n ; { })
+//   bit 1 (CONT2 = 2): continues phase-2 scan  (; { } space \r \t)
+//
+// Phase 1: scan regular source chars that are NOT a phase-1 stop.
+// Phase 2: consume runs of statement-boundary / whitespace chars.
+// Phase 3: consume a trailing \n if present.
+
+const STOP1 = 1;
+const CONT2 = 2;
+
+/** @type {Uint8Array} */
+const CF = new Uint8Array(128);
+CF[10] = STOP1; // \n  – stops phase 1, NOT consumed in phase 2
+CF[59] = STOP1 | CONT2; // ;
+CF[123] = STOP1 | CONT2; // {
+CF[125] = STOP1 | CONT2; // }
+CF[32] = CONT2; // space
+CF[13] = CONT2; // \r
+CF[9] = CONT2; // \t
 
 /**
  * @param {string} str string
@@ -22,28 +38,28 @@ const splitIntoPotentialTokens = (str) => {
 	if (len === 0) return null;
 	const results = [];
 	let i = 0;
-	while (i < len) {
+	outer: while (i < len) {
 		const start = i;
-		block: {
-			let cc = str.charCodeAt(i);
-			while (cc !== 10 && cc !== 59 && cc !== 123 && cc !== 125) {
-				if (++i >= len) break block;
-				cc = str.charCodeAt(i);
-			}
-			while (
-				cc === 59 ||
-				cc === 32 ||
-				cc === 123 ||
-				cc === 125 ||
-				cc === 13 ||
-				cc === 9
-			) {
-				if (++i >= len) break block;
-				cc = str.charCodeAt(i);
+		// Phase 1 – skip regular (non-stop) characters
+		let cc = str.charCodeAt(i);
+		while (cc > 127 || !(CF[cc] & STOP1)) {
+			if (++i >= len) {
+				results.push(str.slice(start, i));
+				break outer;
 			}
-			if (cc === 10) {
-				i++;
+			cc = str.charCodeAt(i);
+		}
+		// Phase 2 – consume delimiter / whitespace run (; { } space \r \t)
+		while (cc < 128 && CF[cc] & CONT2) {
+			if (++i >= len) {
+				results.push(str.slice(start, i));
+				break outer;
 			}
+			cc = str.charCodeAt(i);
+		}
+		// Phase 3 – consume trailing newline
+		if (cc === 10) {
+			i++;
 		}
 		results.push(str.slice(start, i));
 	}