Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/slick-frogs-happen.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"just-bash": patch
---

perf(grep): up to 14.5× speedup via preFilter extensions and matcher reuse.

Anchored alternation patterns like `^def \|^async def` now extract literal needles (stripping outer `^`/`$`), enabling the `String.indexOf` fast-path. Files with no matching needle are rejected before `split("\n")`, skipping RE2 entirely. `acquireMatcher()` extended to `match()`, `replace()`, `search()`, and `matchAll()` to reduce GC pressure across awk/sed hot-paths.
26 changes: 26 additions & 0 deletions packages/just-bash/src/commands/grep/grep.ts
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,32 @@ export const grepCommand: Command = {
}

const content = await ctx.fs.readFile(filePath);

// File-level preFilter: skip searchContent entirely when no needle exists in file.
// Avoids content.split("\n") and all per-line work for the common zero-match case.
if (preFilter && !invertMatch) {
const haystack = preFilter.ignoreCase
? content.toLowerCase()
: content;
if (!preFilter.needles.some((n) => haystack.includes(n))) {
if (countOnly) {
const countStr = showFilename ? `${file}:0` : "0";
return {
file,
result: {
output: `${countStr}\n`,
matched: false,
matchCount: 0,
},
};
}
return {
file,
result: { output: "", matched: false, matchCount: 0 },
};
}
}

const result = searchContent(content, regex, {
invertMatch,
showLineNumbers,
Expand Down
153 changes: 153 additions & 0 deletions packages/just-bash/src/commands/search-engine/matcher.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import { describe, expect, it } from "vitest";
import { searchContent } from "./matcher.js";
import { buildRegex, type PreFilter } from "./regex.js";

/**
 * Exercises `searchContent` with a hand-built `PreFilter` (or none) and pins
 * the emitted lines and the reported match count for each configuration.
 */
describe("preFilterMatches — substring fast-path", () => {
  it("skips lines where no needle is present (case-sensitive)", () => {
    const needleFilter: PreFilter = { ignoreCase: false, needles: ["foo"] };
    const compiled = buildRegex("foo", { mode: "basic" });

    const { output, matchCount } = searchContent(
      "hello world\nfoo bar\nhello foo\n",
      compiled.regex,
      { preFilter: needleFilter },
    );

    expect(output).toBe("foo bar\nhello foo\n");
    expect(matchCount).toBe(2);
  });

  it("lowercases both needle and line when ignoreCase=true", () => {
    const needleFilter: PreFilter = { ignoreCase: true, needles: ["foo"] };
    const compiled = buildRegex("foo", { mode: "basic", ignoreCase: true });

    const { output, matchCount } = searchContent(
      "FOO\nfoo\nbar\n",
      compiled.regex,
      { preFilter: needleFilter },
    );

    // Both the upper- and lower-case spellings must survive the filter.
    expect(output).toBe("FOO\nfoo\n");
    expect(matchCount).toBe(2);
  });

  it("passes a line when any of multiple needles matches (OR logic)", () => {
    const needleFilter: PreFilter = {
      ignoreCase: false,
      needles: ["alpha", "delta"],
    };
    // BRE alternation: `\|` separates the two branches.
    const compiled = buildRegex("alpha\\|delta", { mode: "basic" });

    const { output, matchCount } = searchContent(
      "alpha\nbeta\ngamma\ndelta\n",
      compiled.regex,
      { preFilter: needleFilter },
    );

    expect(output).toBe("alpha\ndelta\n");
    expect(matchCount).toBe(2);
  });

  it("outputs non-needle lines under invertMatch (no unsafe skip)", () => {
    // "bar" and "baz" contain no needle, so they cannot match the regex;
    // with invertMatch those are exactly the lines that must be emitted.
    const needleFilter: PreFilter = { ignoreCase: false, needles: ["foo"] };
    const compiled = buildRegex("foo", { mode: "basic" });

    const { output, matchCount } = searchContent(
      "foo\nbar\nbaz\n",
      compiled.regex,
      { preFilter: needleFilter, invertMatch: true },
    );

    expect(output).toBe("bar\nbaz\n");
    expect(matchCount).toBe(2);
  });

  it("does not apply the fast-path skip when preFilter is absent", () => {
    const compiled = buildRegex("alpha", { mode: "basic" });

    const { output, matchCount } = searchContent(
      "alpha\nbeta\n",
      compiled.regex,
      { preFilter: null },
    );

    expect(output).toBe("alpha\n");
    expect(matchCount).toBe(1);
  });
});

/**
 * Pins how the `replace` option of `searchContent` expands replacement
 * tokens ($&, $1/$2, $<name>), both with and without `onlyMatching`.
 */
describe("applyReplacement — replacement token substitution", () => {
  it("substitutes $& with the full match text", () => {
    const pattern = buildRegex("foo", { mode: "basic" }).regex;

    const { output } = searchContent("foo bar\n", pattern, {
      onlyMatching: true,
      replace: "[$&]",
    });

    expect(output).toBe("[foo]\n");
  });

  it("substitutes $1 and $2 with numbered capture groups", () => {
    const pattern = buildRegex("(\\w+)@(\\w+)", { mode: "extended" }).regex;

    const { output } = searchContent("user@host\n", pattern, {
      onlyMatching: true,
      replace: "$2/$1",
    });

    expect(output).toBe("host/user\n");
  });

  it("substitutes $<name> with named capture groups", () => {
    // Named groups are written with the (?P<name>...) spelling in perl mode.
    const pattern = buildRegex("(?P<user>\\w+)@(?P<host>\\w+)", {
      mode: "perl",
    }).regex;

    const { output } = searchContent("alice@example\n", pattern, {
      onlyMatching: true,
      replace: "$<host>/$<user>",
    });

    expect(output).toBe("example/alice\n");
  });

  it("uses empty string for a missing capture group reference", () => {
    // The optional second group does not participate in the match on "foo".
    const pattern = buildRegex("(foo)(bar)?", { mode: "extended" }).regex;

    const { output } = searchContent("foo\n", pattern, {
      onlyMatching: true,
      replace: "$1-$2",
    });

    expect(output).toBe("foo-\n");
  });

  it("applies replacement inline on the full matching line", () => {
    const pattern = buildRegex("world", { mode: "basic" }).regex;

    // No onlyMatching here: the whole line is expected back with the match swapped.
    const { output } = searchContent("hello world\n", pattern, {
      replace: "WORLD",
    });

    expect(output).toBe("hello WORLD\n");
  });
});

/**
 * Drives `searchContent` with `multiline: true` and the preFilter produced by
 * `buildRegex`, covering the no-needle case, the needle-present case, and the
 * invertMatch case.
 */
describe("searchContentMultiline — file-level preFilter", () => {
  it("returns empty result immediately when no needle in content", () => {
    // 500 lines, none of which contains "def ".
    const rows: string[] = [];
    for (let n = 0; n < 500; n++) {
      rows.push(`line ${n}`);
    }
    const haystack = rows.join("\n");
    const built = buildRegex("^def \\|^async def ", { mode: "basic" });

    const outcome = searchContent(haystack, built.regex, {
      multiline: true,
      preFilter: built.preFilter,
    });

    expect(outcome.matched).toBe(false);
    expect(outcome.output).toBe("");
  });

  it("finds matches normally when needle present in content", () => {
    const haystack = "class Foo:\n pass\ndef bar():\n pass\n";
    // multiline: true adds the m flag so ^ matches at each line boundary
    const built = buildRegex("^def \\|^class ", {
      mode: "basic",
      multiline: true,
    });

    const outcome = searchContent(haystack, built.regex, {
      multiline: true,
      preFilter: built.preFilter,
      showLineNumbers: true,
    });

    expect(outcome.matched).toBe(true);
    expect(outcome.output).toBe("1:class Foo:\n--\n3:def bar():\n");
  });

  it("does NOT skip when invertMatch=true even if needle absent", () => {
    const haystack = "hello\nworld\n";
    const built = buildRegex("^def ", { mode: "basic" });

    const outcome = searchContent(haystack, built.regex, {
      multiline: true,
      preFilter: built.preFilter,
      invertMatch: true,
      showLineNumbers: true,
    });

    // Nothing matches "^def ", so under invertMatch every line is selected.
    expect(outcome.matched).toBe(true);
    expect(outcome.output).toBe("1:hello\n2:world\n");
  });
});
13 changes: 13 additions & 0 deletions packages/just-bash/src/commands/search-engine/matcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ export function searchContent(
showByteOffset,
replace,
kResetGroup,
preFilter,
});
}

Expand Down Expand Up @@ -465,6 +466,7 @@ function searchContentMultiline(
showByteOffset: boolean;
replace: string | null;
kResetGroup?: number;
preFilter?: PreFilter | null;
},
): SearchResult {
const {
Expand All @@ -482,8 +484,19 @@ function searchContentMultiline(
showByteOffset,
replace,
kResetGroup,
preFilter,
} = options;

// File-level preFilter: if no needle appears anywhere in the content, no line can match.
// Only safe when not inverting — an invert-match scan must check every line.
if (preFilter && !invertMatch && !preFilterMatches(preFilter, content)) {
if (countOnly || countMatches) {
const countStr = filename ? `${filename}:0` : "0";
return { output: `${countStr}\n`, matched: false, matchCount: 0 };
}
return { output: "", matched: false, matchCount: 0 };
}

const lines = content.split("\n");
const lineCount = lines.length;
const lastIdx =
Expand Down
71 changes: 63 additions & 8 deletions packages/just-bash/src/commands/search-engine/regex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,47 @@ describe("buildRegex preFilter — happy path", () => {
ignoreCase: false,
});
});

// Anchors at the outer edge of an alternative are dropped; the literal remains as the needle.
it("extracts needle from leading-anchored literal ^foo", () => {
  const { preFilter } = buildRegex("^foo", { mode: "extended" });
  expect(preFilter).toEqual({ needles: ["foo"], ignoreCase: false });
});

it("extracts needle from trailing-anchored literal foo$", () => {
  const { preFilter } = buildRegex("foo$", { mode: "extended" });
  expect(preFilter).toEqual({ needles: ["foo"], ignoreCase: false });
});

it("extracts needle from fully-anchored literal ^foo$", () => {
  const { preFilter } = buildRegex("^foo$", { mode: "extended" });
  expect(preFilter).toEqual({ needles: ["foo"], ignoreCase: false });
});

it("extracts needles from anchored alternation ^def |^async def (the issue case)", () => {
  // BRE: \| is alternation. This is the canonical pattern from issue #89.
  const { preFilter } = buildRegex("^def \\|^async def ", { mode: "basic" });
  expect(preFilter).toEqual({
    needles: ["def ", "async def "],
    ignoreCase: false,
  });
});

it("extracts needles from mixed anchored/unanchored alternation", () => {
  const { preFilter } = buildRegex("^foo|bar|baz$", { mode: "extended" });
  expect(preFilter).toEqual({
    needles: ["foo", "bar", "baz"],
    ignoreCase: false,
  });
});

it("preserves literal $ when escaped (foo\\$ → 'foo$')", () => {
  // An escaped $ is an ordinary character, so it stays in the needle.
  const { preFilter } = buildRegex("foo\\$", { mode: "extended" });
  expect(preFilter).toEqual({ needles: ["foo$"], ignoreCase: false });
});
});

describe("buildRegex preFilter — safety (must NOT extract)", () => {
Expand All @@ -91,14 +132,6 @@ describe("buildRegex preFilter — safety (must NOT extract)", () => {
expect(buildRegex("[abc]", { mode: "extended" }).preFilter).toBeUndefined();
});

it("rejects start anchor ^", () => {
expect(buildRegex("^foo", { mode: "extended" }).preFilter).toBeUndefined();
});

it("rejects end anchor $", () => {
expect(buildRegex("foo$", { mode: "extended" }).preFilter).toBeUndefined();
});

it("rejects . (matches any char)", () => {
// basic mode: . is the dot meta in BRE, not escaped
expect(buildRegex("a.b", { mode: "basic" }).preFilter).toBeUndefined();
Expand Down Expand Up @@ -156,6 +189,28 @@ describe("buildRegex preFilter — safety (must NOT extract)", () => {
buildRegex("\\u2764", { mode: "extended" }).preFilter,
).toBeUndefined();
});

// Patterns where an alternative has no literal text, or where ^ sits mid-pattern, must yield no preFilter.
it("rejects bare ^ (anchor-only, no useful needle)", () => {
  const { preFilter } = buildRegex("^", { mode: "extended" });
  expect(preFilter).toBeUndefined();
});

it("rejects bare $ (anchor-only, no useful needle)", () => {
  const { preFilter } = buildRegex("$", { mode: "extended" });
  expect(preFilter).toBeUndefined();
});

it("rejects ^$ (line-boundary anchor pair, no needle)", () => {
  const { preFilter } = buildRegex("^$", { mode: "extended" });
  expect(preFilter).toBeUndefined();
});

it("rejects ^a|$ (one branch has no needle)", () => {
  const { preFilter } = buildRegex("^a|$", { mode: "extended" });
  expect(preFilter).toBeUndefined();
});

it("rejects mid-alternative ^ (literal ^ in middle is meta)", () => {
  const { preFilter } = buildRegex("foo^bar", { mode: "extended" });
  expect(preFilter).toBeUndefined();
});
});

describe("buildRegex preFilter — structural correctness", () => {
Expand Down
23 changes: 20 additions & 3 deletions packages/just-bash/src/commands/search-engine/regex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,28 @@ function splitTopLevelAlternation(pattern: string): string[] | null {
* something other than itself (quantifier, anchor, character class, group, dot).
*/
function literalFromAlternative(alt: string): string | null {
// Strip a leading unescaped ^ anchor.
let inner = alt;
if (inner.startsWith("^")) {
inner = inner.slice(1);
}
// Strip a trailing unescaped $ anchor. Walk back the run of trailing
// backslashes: $ is an anchor iff that run has even length.
if (inner.endsWith("$")) {
let bs = 0;
for (let i = inner.length - 2; i >= 0 && inner[i] === "\\"; i--) bs++;
if (bs % 2 === 0) {
inner = inner.slice(0, -1);
}
}
// Anchor-only alternative — no useful needle.
if (inner.length === 0) return null;

let out = "";
for (let i = 0; i < alt.length; i++) {
const c = alt[i];
for (let i = 0; i < inner.length; i++) {
const c = inner[i];
if (c === "\\") {
const next = alt[i + 1];
const next = inner[i + 1];
if (next === undefined) return null;
// Reject escapes that aren't simple literal substitutions.
// \n, \t, \r are literal whitespace — fine. \d, \w, \s, \b, \B etc.
Expand Down
Loading
Loading