diff --git a/.changeset/slick-frogs-happen.md b/.changeset/slick-frogs-happen.md new file mode 100644 index 00000000..23f45732 --- /dev/null +++ b/.changeset/slick-frogs-happen.md @@ -0,0 +1,7 @@ +--- +"just-bash": patch +--- + +perf(grep): up to 14.5× speedup via preFilter extensions and matcher reuse. + +Anchored alternation patterns like `^def \|^async def` now extract literal needles (stripping outer `^`/`$`), enabling the `String.indexOf` fast-path. When the match is not inverted, files with no matching needle are rejected before `split("\n")`, skipping RE2 entirely. `acquireMatcher()` is now also used by global `match()`, string-replacement `replace()`, and `search()` to reduce GC pressure across awk/sed hot-paths; callback `replace()` and `matchAll()` intentionally keep a private matcher so re-entrant or interleaved calls on the same `UserRegex` cannot corrupt their iteration state. \ No newline at end of file diff --git a/packages/just-bash/src/commands/grep/grep.ts b/packages/just-bash/src/commands/grep/grep.ts index 6a112fd2..71b10633 100644 --- a/packages/just-bash/src/commands/grep/grep.ts +++ b/packages/just-bash/src/commands/grep/grep.ts @@ -358,6 +358,32 @@ export const grepCommand: Command = { } const content = await ctx.fs.readFile(filePath); + + // File-level preFilter: skip searchContent entirely when no needle exists in file. + // Avoids content.split("\n") and all per-line work for the common zero-match case. + if (preFilter && !invertMatch) { + const haystack = preFilter.ignoreCase + ? content.toLowerCase() + : content; + if (!preFilter.needles.some((n) => haystack.includes(n))) { + if (countOnly) { + const countStr = showFilename ? 
`${file}:0` : "0"; + return { + file, + result: { + output: `${countStr}\n`, + matched: false, + matchCount: 0, + }, + }; + } + return { + file, + result: { output: "", matched: false, matchCount: 0 }, + }; + } + } + const result = searchContent(content, regex, { invertMatch, showLineNumbers, diff --git a/packages/just-bash/src/commands/search-engine/matcher.test.ts b/packages/just-bash/src/commands/search-engine/matcher.test.ts new file mode 100644 index 00000000..75cdc782 --- /dev/null +++ b/packages/just-bash/src/commands/search-engine/matcher.test.ts @@ -0,0 +1,153 @@ +import { describe, expect, it } from "vitest"; +import { searchContent } from "./matcher.js"; +import { buildRegex, type PreFilter } from "./regex.js"; + +describe("preFilterMatches — substring fast-path", () => { + it("skips lines where no needle is present (case-sensitive)", () => { + const content = "hello world\nfoo bar\nhello foo\n"; + const { regex } = buildRegex("foo", { mode: "basic" }); + const preFilter: PreFilter = { needles: ["foo"], ignoreCase: false }; + const result = searchContent(content, regex, { preFilter }); + expect(result.output).toBe("foo bar\nhello foo\n"); + expect(result.matchCount).toBe(2); + }); + + it("lowercases both needle and line when ignoreCase=true", () => { + const content = "FOO\nfoo\nbar\n"; + const { regex } = buildRegex("foo", { mode: "basic", ignoreCase: true }); + const preFilter: PreFilter = { needles: ["foo"], ignoreCase: true }; + const result = searchContent(content, regex, { preFilter }); + expect(result.output).toBe("FOO\nfoo\n"); + expect(result.matchCount).toBe(2); + }); + + it("passes a line when any of multiple needles matches (OR logic)", () => { + const content = "alpha\nbeta\ngamma\ndelta\n"; + const { regex } = buildRegex("alpha\\|delta", { mode: "basic" }); + const preFilter: PreFilter = { + needles: ["alpha", "delta"], + ignoreCase: false, + }; + const result = searchContent(content, regex, { preFilter }); + 
expect(result.output).toBe("alpha\ndelta\n"); + expect(result.matchCount).toBe(2); + }); + + it("outputs non-needle lines under invertMatch (no unsafe skip)", () => { + // "bar" and "baz" contain no needle — preFilter sets firstMatch=null → no + // regex match → with invertMatch those lines ARE output. The fast-path + // is still correct because a line without the needle provably can't match. + const content = "foo\nbar\nbaz\n"; + const { regex } = buildRegex("foo", { mode: "basic" }); + const preFilter: PreFilter = { needles: ["foo"], ignoreCase: false }; + const result = searchContent(content, regex, { + preFilter, + invertMatch: true, + }); + expect(result.output).toBe("bar\nbaz\n"); + expect(result.matchCount).toBe(2); + }); + + it("does not apply the fast-path skip when preFilter is absent", () => { + const content = "alpha\nbeta\n"; + const { regex } = buildRegex("alpha", { mode: "basic" }); + const result = searchContent(content, regex, { preFilter: null }); + expect(result.output).toBe("alpha\n"); + expect(result.matchCount).toBe(1); + }); +}); + +describe("applyReplacement — replacement token substitution", () => { + it("substitutes $& with the full match text", () => { + const { regex } = buildRegex("foo", { mode: "basic" }); + const result = searchContent("foo bar\n", regex, { + replace: "[$&]", + onlyMatching: true, + }); + expect(result.output).toBe("[foo]\n"); + }); + + it("substitutes $1 and $2 with numbered capture groups", () => { + const { regex } = buildRegex("(\\w+)@(\\w+)", { mode: "extended" }); + const result = searchContent("user@host\n", regex, { + replace: "$2/$1", + onlyMatching: true, + }); + expect(result.output).toBe("host/user\n"); + }); + + it("substitutes $<name> with named capture groups", () => { + const { regex } = buildRegex("(?P<user>\\w+)@(?P<host>\\w+)", { + mode: "perl", + }); + const result = searchContent("alice@example\n", regex, { + replace: "$<host>/$<user>", + onlyMatching: true, + }); + expect(result.output).toBe("example/alice\n"); + }); + + 
it("uses empty string for a missing capture group reference", () => { + const { regex } = buildRegex("(foo)(bar)?", { mode: "extended" }); + const result = searchContent("foo\n", regex, { + replace: "$1-$2", + onlyMatching: true, + }); + expect(result.output).toBe("foo-\n"); + }); + + it("applies replacement inline on the full matching line", () => { + const { regex } = buildRegex("world", { mode: "basic" }); + const result = searchContent("hello world\n", regex, { + replace: "WORLD", + }); + expect(result.output).toBe("hello WORLD\n"); + }); +}); + +describe("searchContentMultiline — file-level preFilter", () => { + it("returns empty result immediately when no needle in content", () => { + const content = Array.from({ length: 500 }, (_, i) => `line ${i}`).join( + "\n", + ); + const { regex, preFilter } = buildRegex("^def \\|^async def ", { + mode: "basic", + }); + const result = searchContent(content, regex, { + multiline: true, + preFilter, + }); + expect(result.matched).toBe(false); + expect(result.output).toBe(""); + }); + + it("finds matches normally when needle present in content", () => { + const content = "class Foo:\n pass\ndef bar():\n pass\n"; + // multiline: true adds the m flag so ^ matches at each line boundary + const { regex, preFilter } = buildRegex("^def \\|^class ", { + mode: "basic", + multiline: true, + }); + const result = searchContent(content, regex, { + multiline: true, + preFilter, + showLineNumbers: true, + }); + expect(result.matched).toBe(true); + expect(result.output).toBe("1:class Foo:\n--\n3:def bar():\n"); + }); + + it("does NOT skip when invertMatch=true even if needle absent", () => { + const content = "hello\nworld\n"; + const { regex, preFilter } = buildRegex("^def ", { mode: "basic" }); + const result = searchContent(content, regex, { + multiline: true, + preFilter, + invertMatch: true, + showLineNumbers: true, + }); + // All lines match because none contain "def " at line start + expect(result.matched).toBe(true); + 
expect(result.output).toBe("1:hello\n2:world\n"); + }); +}); diff --git a/packages/just-bash/src/commands/search-engine/matcher.ts b/packages/just-bash/src/commands/search-engine/matcher.ts index 036bfdd2..136657d9 100644 --- a/packages/just-bash/src/commands/search-engine/matcher.ts +++ b/packages/just-bash/src/commands/search-engine/matcher.ts @@ -148,6 +148,7 @@ export function searchContent( showByteOffset, replace, kResetGroup, + preFilter, }); } @@ -465,6 +466,7 @@ function searchContentMultiline( showByteOffset: boolean; replace: string | null; kResetGroup?: number; + preFilter?: PreFilter | null; }, ): SearchResult { const { @@ -482,8 +484,19 @@ function searchContentMultiline( showByteOffset, replace, kResetGroup, + preFilter, } = options; + // File-level preFilter: if no needle appears anywhere in the content, no line can match. + // Only safe when not inverting — an invert-match scan must check every line. + if (preFilter && !invertMatch && !preFilterMatches(preFilter, content)) { + if (countOnly || countMatches) { + const countStr = filename ? 
`${filename}:0` : "0"; + return { output: `${countStr}\n`, matched: false, matchCount: 0 }; + } + return { output: "", matched: false, matchCount: 0 }; + } + const lines = content.split("\n"); const lineCount = lines.length; const lastIdx = diff --git a/packages/just-bash/src/commands/search-engine/regex.test.ts b/packages/just-bash/src/commands/search-engine/regex.test.ts index 77fb1696..0fdc565e 100644 --- a/packages/just-bash/src/commands/search-engine/regex.test.ts +++ b/packages/just-bash/src/commands/search-engine/regex.test.ts @@ -66,6 +66,47 @@ describe("buildRegex preFilter — happy path", () => { ignoreCase: false, }); }); + + it("extracts needle from leading-anchored literal ^foo", () => { + expect(buildRegex("^foo", { mode: "extended" }).preFilter).toEqual({ + needles: ["foo"], + ignoreCase: false, + }); + }); + + it("extracts needle from trailing-anchored literal foo$", () => { + expect(buildRegex("foo$", { mode: "extended" }).preFilter).toEqual({ + needles: ["foo"], + ignoreCase: false, + }); + }); + + it("extracts needle from fully-anchored literal ^foo$", () => { + expect(buildRegex("^foo$", { mode: "extended" }).preFilter).toEqual({ + needles: ["foo"], + ignoreCase: false, + }); + }); + + it("extracts needles from anchored alternation ^def |^async def (the issue case)", () => { + // BRE: \| is alternation. This is the canonical pattern from issue #89. 
+ expect( + buildRegex("^def \\|^async def ", { mode: "basic" }).preFilter, + ).toEqual({ needles: ["def ", "async def "], ignoreCase: false }); + }); + + it("extracts needles from mixed anchored/unanchored alternation", () => { + expect(buildRegex("^foo|bar|baz$", { mode: "extended" }).preFilter).toEqual( + { needles: ["foo", "bar", "baz"], ignoreCase: false }, + ); + }); + + it("preserves literal $ when escaped (foo\\$ → 'foo$')", () => { + expect(buildRegex("foo\\$", { mode: "extended" }).preFilter).toEqual({ + needles: ["foo$"], + ignoreCase: false, + }); + }); }); describe("buildRegex preFilter — safety (must NOT extract)", () => { @@ -91,14 +132,6 @@ describe("buildRegex preFilter — safety (must NOT extract)", () => { expect(buildRegex("[abc]", { mode: "extended" }).preFilter).toBeUndefined(); }); - it("rejects start anchor ^", () => { - expect(buildRegex("^foo", { mode: "extended" }).preFilter).toBeUndefined(); - }); - - it("rejects end anchor $", () => { - expect(buildRegex("foo$", { mode: "extended" }).preFilter).toBeUndefined(); - }); - it("rejects . (matches any char)", () => { // basic mode: . 
is the dot meta in BRE, not escaped expect(buildRegex("a.b", { mode: "basic" }).preFilter).toBeUndefined(); @@ -156,6 +189,28 @@ describe("buildRegex preFilter — safety (must NOT extract)", () => { buildRegex("\\u2764", { mode: "extended" }).preFilter, ).toBeUndefined(); }); + + it("rejects bare ^ (anchor-only, no useful needle)", () => { + expect(buildRegex("^", { mode: "extended" }).preFilter).toBeUndefined(); + }); + + it("rejects bare $ (anchor-only, no useful needle)", () => { + expect(buildRegex("$", { mode: "extended" }).preFilter).toBeUndefined(); + }); + + it("rejects ^$ (line-boundary anchor pair, no needle)", () => { + expect(buildRegex("^$", { mode: "extended" }).preFilter).toBeUndefined(); + }); + + it("rejects ^a|$ (one branch has no needle)", () => { + expect(buildRegex("^a|$", { mode: "extended" }).preFilter).toBeUndefined(); + }); + + it("rejects mid-alternative ^ (literal ^ in middle is meta)", () => { + expect( + buildRegex("foo^bar", { mode: "extended" }).preFilter, + ).toBeUndefined(); + }); }); describe("buildRegex preFilter — structural correctness", () => { diff --git a/packages/just-bash/src/commands/search-engine/regex.ts b/packages/just-bash/src/commands/search-engine/regex.ts index 3af49490..315a1d96 100644 --- a/packages/just-bash/src/commands/search-engine/regex.ts +++ b/packages/just-bash/src/commands/search-engine/regex.ts @@ -334,11 +334,28 @@ function splitTopLevelAlternation(pattern: string): string[] | null { * something other than itself (quantifier, anchor, character class, group, dot). */ function literalFromAlternative(alt: string): string | null { + // Strip a leading unescaped ^ anchor. + let inner = alt; + if (inner.startsWith("^")) { + inner = inner.slice(1); + } + // Strip a trailing unescaped $ anchor. Walk back the run of trailing + // backslashes: $ is an anchor iff that run has even length. 
+ if (inner.endsWith("$")) { + let bs = 0; + for (let i = inner.length - 2; i >= 0 && inner[i] === "\\"; i--) bs++; + if (bs % 2 === 0) { + inner = inner.slice(0, -1); + } + } + // Anchor-only alternative — no useful needle. + if (inner.length === 0) return null; + let out = ""; - for (let i = 0; i < alt.length; i++) { - const c = alt[i]; + for (let i = 0; i < inner.length; i++) { + const c = inner[i]; if (c === "\\") { - const next = alt[i + 1]; + const next = inner[i + 1]; if (next === undefined) return null; // Reject escapes that aren't simple literal substitutions. // \n, \t, \r are literal whitespace — fine. \d, \w, \s, \b, \B etc. diff --git a/packages/just-bash/src/regex/user-regex.test.ts b/packages/just-bash/src/regex/user-regex.test.ts index 0f557de7..310367f4 100644 --- a/packages/just-bash/src/regex/user-regex.test.ts +++ b/packages/just-bash/src/regex/user-regex.test.ts @@ -400,6 +400,52 @@ describe("RegexLike interface compatibility", () => { } }); +describe("acquireMatcher reuse — all methods", () => { + it("match() global reuses cached matcher", () => { + const re = new UserRegex("o+", "g"); + const r1 = re.match("foooo bar ooo"); + expect(r1).toEqual(["oooo", "ooo"]); + const r2 = re.match("bar baz"); + expect(r2).toBeNull(); + }); + + it("replace() string path returns correct result", () => { + const re = new UserRegex("foo", "g"); + expect(re.replace("foo bar foo", "baz")).toBe("baz bar baz"); + expect(re.replace("foo only once", "X")).toBe("X only once"); + }); + + it("replace() callback path returns correct result", () => { + const re = new UserRegex("(\\w+)", "g"); + const result = re.replace("hello world", (m) => m.toUpperCase()); + expect(result).toBe("HELLO WORLD"); + }); + + it("search() returns correct index", () => { + const re = new UserRegex("bar"); + expect(re.search("foo bar baz")).toBe(4); + expect(re.search("no match")).toBe(-1); + }); + + it("matchAll() yields all matches with groups", () => { + const re = new UserRegex("(\\d+)", 
"g"); + const matches = [...re.matchAll("a1 b22 c333")]; + expect(matches).toHaveLength(3); + expect(matches[0]?.[1]).toBe("1"); + expect(matches[1]?.[1]).toBe("22"); + expect(matches[2]?.[1]).toBe("333"); + }); + + it("sequential calls on same instance don't leak state", () => { + const re = new UserRegex("x", "g"); + for (let i = 0; i < 1000; i++) { + const r = re.match(i % 2 === 0 ? "x" : "y"); + if (i % 2 === 0) expect(r).toEqual(["x"]); + else expect(r).toBeNull(); + } + }); +}); + describe("edge cases", () => { describe("special regex characters in pattern", () => { it("handles escaped special chars", () => { diff --git a/packages/just-bash/src/regex/user-regex.ts b/packages/just-bash/src/regex/user-regex.ts index 6206cf64..6039af6c 100644 --- a/packages/just-bash/src/regex/user-regex.ts +++ b/packages/just-bash/src/regex/user-regex.ts @@ -241,7 +241,7 @@ export class UserRegex implements RegexLike { // Global: return all matches without groups const matches: string[] = []; - const matcher = this._re2.matcher(input); + const matcher = this.acquireMatcher(input); let pos = 0; while (matcher.find(pos)) { @@ -270,15 +270,20 @@ export class UserRegex implements RegexLike { } if (typeof replacement === "string") { - const matcher = this._re2.matcher(input); - // Use perlMode=true for JavaScript-style replacement ($1, $2, etc.) + const matcher = this.acquireMatcher(input); if (this._global) { return matcher.replaceAll(replacement, true); } return matcher.replaceFirst(replacement, true); } - // Callback replacement - we need to do this manually + // Callback replacement - we need to do this manually. + // Use a fresh Matcher rather than the shared cached one: the user-provided + // callback may re-enter this same UserRegex instance (e.g. call test/exec/ + // replace), which would route through acquireMatcher and repoint the shared + // matcher's charSequence to a different input. The next matcher.find(pos) + // would then advance through the wrong string. 
A fresh matcher keeps the + // iteration state private to this replace() call. const result: string[] = []; const matcher = this._re2.matcher(input); let lastEnd = 0; @@ -313,13 +318,18 @@ export class UserRegex implements RegexLike { args.push(groups); } - // Call replacement function + // Capture positions before invoking callback. The matcher is private to + // this call, but capturing now avoids relying on matcher state being + // unchanged across the callback boundary. + const matchStart = matcher.start(0); + const matchEnd = matcher.end(0); + result.push(replacement(fullMatch, ...args)); - lastEnd = matcher.end(0); + lastEnd = matchEnd; pos = lastEnd; // Handle zero-length matches - if (matcher.start(0) === matcher.end(0)) { + if (matchStart === matchEnd) { pos++; } @@ -355,7 +365,7 @@ export class UserRegex implements RegexLike { * Returns the index of the first match, or -1 if not found. */ search(input: string): number { - const matcher = this._re2.matcher(input); + const matcher = this.acquireMatcher(input); if (matcher.find()) { return matcher.start(0); } @@ -371,6 +381,10 @@ export class UserRegex implements RegexLike { } this._lastIndex = 0; + // matchAll is a generator that suspends at `yield`. The shared `_matcher` + // would be corrupted if a caller interleaves any other method on the same + // UserRegex instance between two `next()` calls (acquireMatcher would + // reset/repoint it). Use a fresh Matcher to keep iterator state private. const matcher = this._re2.matcher(input); const groupCount = this._re2.groupCount(); const namedGroups = this._re2.namedGroups();