Skip to content

Commit b8b24f0

Browse files
authored
Merge pull request #23 from AztecProtocol/chore/search-cleanup-and-error-ref-trim
chore(search,error-lookup): execFileSync for rg + trim noisy code refs
2 parents 80f17fe + 23228ff commit b8b24f0

4 files changed

Lines changed: 391 additions & 72 deletions

File tree

src/utils/error-lookup.ts

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,50 @@ function getAllEntries(): ErrorEntry[] {
369369
return [...STATIC_ERROR_CATALOG, ...getDynamicEntries()];
370370
}
371371

372+
// ---------------------------------------------------------------------------
373+
// Code-reference filtering
374+
// ---------------------------------------------------------------------------
375+
376+
const TEST_PATH_RE = /\.(test|spec|e2e)\.ts$/i;
377+
// Segment-bounded: the named segment must be a full path component
378+
// (preceded by a path boundary — start-of-string or ``/`` — and
379+
// followed by ``/``). So ``/test/`` and ``tests/foo.ts`` match, but
380+
// ``/contest/``, ``/latest/``, ``/testdata/``, ``/attestations/`` do
381+
// NOT — they're real source dirs that incidentally contain those
382+
// substrings.
383+
const TEST_DIR_RE = /(?:^|\/)(?:tests?|__tests__|e2e|fixtures?|mocks?)\//i;
384+
385+
/**
386+
* Drop OBVIOUSLY content-thin lines: very long single-line content
387+
* dominated by a hex run with no whitespace breaks. Tuned to catch
388+
* generated bytecode and hex blobs while leaving regex literals,
389+
* long template strings, and selector maps alone — those still help
390+
* a human navigate to the right file.
391+
*/
392+
export function isMinifiedShape(content: string): boolean {
393+
const trimmed = content.trim();
394+
if (trimmed.length < 400) return false;
395+
const longestHexRun = (trimmed.match(/[0-9a-fA-F]{200,}/g) ?? [])
396+
.reduce((m, t) => Math.max(m, t.length), 0);
397+
return longestHexRun >= 200;
398+
}
399+
400+
/**
401+
* Heuristic: is this `lookup_error` code reference likely to actually
402+
* help the user understand or fix their error? Drops obvious test
403+
* files and minified bytecode lines; keeps everything else.
404+
*
405+
* Path-based exclusions are the primary signal; the content-shape
406+
* filter is deliberately narrow to avoid dropping legitimate long
407+
* source lines.
408+
*/
409+
export function isUsefulCodeRef(r: SearchResult): boolean {
410+
if (TEST_PATH_RE.test(r.file)) return false;
411+
if (TEST_DIR_RE.test(r.file)) return false;
412+
if (isMinifiedShape(r.content)) return false;
413+
return true;
414+
}
415+
372416
// ---------------------------------------------------------------------------
373417
// Matching algorithm
374418
// ---------------------------------------------------------------------------
@@ -466,15 +510,22 @@ export function lookupError(
466510
matches.sort((a, b) => b.score - a.score);
467511
const catalogMatches = matches.slice(0, maxResults);
468512

469-
// Fallback: if fewer than 3 catalog matches, search code
513+
// Fallback: if fewer than 3 catalog matches, search code. Over-fetch
514+
// (RAW_CODE_LIMIT) so the post-filter has headroom — if all of the
515+
// top hits are tests / fixtures / minified noise, slicing the raw
516+
// result to 2 would otherwise return zero useful refs even when good
517+
// ones exist deeper in the result set.
518+
const RAW_CODE_LIMIT = 20;
519+
const KEEP_CODE_LIMIT = 2;
470520
let codeMatches: SearchResult[] = [];
471521
if (catalogMatches.length < 3) {
472522
try {
473-
codeMatches = searchCode(query, {
523+
const raw = searchCode(query, {
474524
filePattern: "*.ts",
475525
repo: "aztec-packages",
476-
maxResults: Math.min(maxResults, 5),
526+
maxResults: RAW_CODE_LIMIT,
477527
});
528+
codeMatches = raw.filter(isUsefulCodeRef).slice(0, KEEP_CODE_LIMIT);
478529
} catch {
479530
// Repos may not be cloned — that's fine
480531
}

src/utils/search.ts

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Search utilities for finding content in cloned repositories
33
*/
44

5-
import { execSync } from "child_process";
5+
import { execFileSync } from "child_process";
66
import { existsSync, readFileSync } from "fs";
77
import { join, relative, extname } from "path";
88
import { globbySync } from "globby";
@@ -43,28 +43,37 @@ export function searchCode(
4343
}
4444

4545
try {
46-
// Try ripgrep first (fast)
47-
const rgFlags = [
48-
caseSensitive ? "" : "-i",
49-
"-n", // line numbers
46+
// Build argv for ripgrep. Using execFileSync (NOT execSync) skips
47+
// the shell layer, so glob patterns like ``*.{nr,ts}`` reach rg
48+
// verbatim instead of being brace-expanded by ``/bin/sh`` and then
49+
// glob-expanded against the node-process cwd. The query is passed
50+
// via ``-e`` so a query starting with a dash isn't reparsed as an
51+
// rg flag, and ``--`` ends flag parsing before the search path.
52+
const rgArgs: string[] = [];
53+
if (!caseSensitive) rgArgs.push("-i");
54+
rgArgs.push(
55+
"-n",
5056
"--no-heading",
5157
"-g",
5258
filePattern,
5359
"-m",
54-
String(maxResults * 2), // Get more, then trim
55-
]
56-
.filter(Boolean)
57-
.join(" ");
60+
String(maxResults * 2),
61+
"-e",
62+
query,
63+
"--",
64+
searchPath,
65+
);
5866

59-
const result = execSync(`rg ${rgFlags} "${escapeShell(query)}" "${searchPath}"`, {
67+
const result = execFileSync("rg", rgArgs, {
6068
encoding: "utf-8",
6169
maxBuffer: 10 * 1024 * 1024,
6270
timeout: 30000,
6371
});
6472

6573
return parseRgOutput(result, maxResults);
66-
} catch (error) {
67-
// Ripgrep not found or no matches, fall back to manual search
74+
} catch {
75+
// Ripgrep not found, no matches (rg exits 1), or other failure:
76+
// fall back to the in-process globby + readFile loop.
6877
return manualSearch(query, searchPath, filePattern, maxResults, caseSensitive);
6978
}
7079
}
@@ -207,14 +216,6 @@ export function getResultPriority(result: SearchResult): number {
207216

208217
// --- Helper functions ---
209218

210-
/**
211-
* Escape a string for safe use inside double quotes in a shell command.
212-
* Preserves regex syntax (|, *, +, etc.) while preventing shell injection.
213-
*/
214-
function escapeShell(str: string): string {
215-
return str.replace(/["$`\\!]/g, "\\$&");
216-
}
217-
218219
function parseRgOutput(output: string, maxResults: number): SearchResult[] {
219220
const results: SearchResult[] = [];
220221
const lines = output.split("\n").filter(Boolean);

tests/utils/error-lookup.test.ts

Lines changed: 182 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ vi.mock("fs", () => ({
88
}));
99

1010
vi.mock("child_process", () => ({
11-
execSync: vi.fn(),
11+
execFileSync: vi.fn(),
1212
}));
1313

1414
vi.mock("globby", () => ({
@@ -21,6 +21,7 @@ vi.mock("../../src/utils/git.js", () => ({
2121
isRepoCloned: vi.fn(() => false),
2222
}));
2323

24+
import { execFileSync } from "child_process";
2425
import { existsSync, readFileSync } from "fs";
2526
import {
2627
parseSolidityErrors,
@@ -29,10 +30,13 @@ import {
2930
parseOperatorFaq,
3031
lookupError,
3132
clearErrorCache,
33+
isUsefulCodeRef,
34+
isMinifiedShape,
3235
} from "../../src/utils/error-lookup.js";
3336

3437
const mockExistsSync = vi.mocked(existsSync);
3538
const mockReadFileSync = vi.mocked(readFileSync);
39+
const mockExecFileSync = vi.mocked(execFileSync);
3640

3741
beforeEach(() => {
3842
vi.clearAllMocks();
@@ -260,3 +264,180 @@ describe("lookupError", () => {
260264
}
261265
});
262266
});
267+
268+
// ---------------------------------------------------------------------------
269+
// Code-reference filter — unit
270+
// ---------------------------------------------------------------------------
271+
272+
describe("isUsefulCodeRef", () => {
273+
const ok = (file: string, content = "export const FOO = 1;") => ({
274+
file,
275+
content,
276+
repo: "aztec-packages",
277+
line: 1,
278+
});
279+
280+
it("keeps a normal source line", () => {
281+
expect(isUsefulCodeRef(ok("yarn-project/foo.ts"))).toBe(true);
282+
});
283+
284+
it("drops .test.ts files", () => {
285+
expect(isUsefulCodeRef(ok("yarn-project/foo.test.ts"))).toBe(false);
286+
});
287+
288+
it("drops .spec.ts files", () => {
289+
expect(isUsefulCodeRef(ok("yarn-project/foo.spec.ts"))).toBe(false);
290+
});
291+
292+
it("drops .e2e.ts files", () => {
293+
expect(isUsefulCodeRef(ok("yarn-project/foo.e2e.ts"))).toBe(false);
294+
});
295+
296+
it("drops files inside __tests__ directories", () => {
297+
expect(isUsefulCodeRef(ok("yarn-project/__tests__/foo.ts"))).toBe(false);
298+
});
299+
300+
it("drops files inside /test/ and /tests/ directories", () => {
301+
expect(isUsefulCodeRef(ok("yarn-project/test/foo.ts"))).toBe(false);
302+
expect(isUsefulCodeRef(ok("yarn-project/tests/foo.ts"))).toBe(false);
303+
});
304+
305+
it("drops files inside /e2e/ and /fixtures/ and /mocks/", () => {
306+
expect(isUsefulCodeRef(ok("yarn-project/e2e/foo.ts"))).toBe(false);
307+
expect(isUsefulCodeRef(ok("yarn-project/fixtures/foo.ts"))).toBe(false);
308+
expect(isUsefulCodeRef(ok("yarn-project/mocks/foo.ts"))).toBe(false);
309+
});
310+
311+
it("does NOT drop on incidental substrings (latest, contest, attestations, testdata)", () => {
312+
// The dir regex is segment-bounded — these are real source dirs
313+
// that incidentally contain ``test``/``latest``/``contest`` as
314+
// substrings, NOT as full path segments. They must survive.
315+
expect(isUsefulCodeRef(ok("yarn-project/latest/foo.ts"))).toBe(true);
316+
expect(isUsefulCodeRef(ok("yarn-project/contest/foo.ts"))).toBe(true);
317+
expect(isUsefulCodeRef(ok("yarn-project/attestations/foo.ts"))).toBe(true);
318+
expect(isUsefulCodeRef(ok("yarn-project/testdata/foo.ts"))).toBe(true);
319+
});
320+
321+
it("drops paths that begin with a test segment (no leading slash)", () => {
322+
// Ripgrep returns relative-from-REPOS_DIR paths, but a relative
323+
// path could in theory start with a test segment if the repo
324+
// itself is named ``tests`` or similar. The leading-boundary in
325+
// TEST_DIR_RE handles this; locking it in here.
326+
expect(isUsefulCodeRef(ok("tests/foo.ts"))).toBe(false);
327+
expect(isUsefulCodeRef(ok("__tests__/foo.ts"))).toBe(false);
328+
});
329+
330+
it("drops minified-shape lines (long pure-hex run)", () => {
331+
// 600 chars of hex — clearly bytecode.
332+
const hex = "deadbeef".repeat(75);
333+
expect(isUsefulCodeRef(ok("yarn-project/foo.ts", hex))).toBe(false);
334+
});
335+
336+
it("keeps long regex literals (no continuous hex run)", () => {
337+
// A regex literal that's long but not hex-shaped.
338+
const line =
339+
"const re = /[A-Za-z0-9_]{200}.*([gimsuy]|[A-Z]+){50}/.test(input);";
340+
expect(isUsefulCodeRef(ok("yarn-project/foo.ts", line))).toBe(true);
341+
});
342+
343+
it("keeps short lines that contain hex literals", () => {
344+
const line = "const sig = 0xdeadbeefcafebabe; // selector";
345+
expect(isUsefulCodeRef(ok("yarn-project/foo.ts", line))).toBe(true);
346+
});
347+
});
348+
349+
describe("isMinifiedShape", () => {
350+
it("returns false for short content", () => {
351+
expect(isMinifiedShape("a".repeat(399))).toBe(false);
352+
});
353+
354+
it("returns true only when a 200+ char hex run is present", () => {
355+
const hex = "deadbeef".repeat(25); // 200 chars of hex
356+
const padded = "x".repeat(200) + hex;
357+
expect(padded.length).toBeGreaterThanOrEqual(400);
358+
expect(isMinifiedShape(padded)).toBe(true);
359+
});
360+
361+
it("returns false when long but no contiguous hex run", () => {
362+
// 500 chars of mixed alphanumeric, but no 200+ hex run.
363+
const line = ("aZ0_".repeat(125)).slice(0, 500);
364+
expect(isMinifiedShape(line)).toBe(false);
365+
});
366+
367+
it("returns false for a long generated-looking source line without hex blob", () => {
368+
// Realistic noisy generated TypeScript: long but full of identifiers,
369+
// commas, brackets — semantic content a human can still navigate.
370+
// No 200-char contiguous hex run, so the heuristic must keep it.
371+
const line =
372+
"export const TX_ERROR_CODES = { " +
373+
Array.from({ length: 60 }, (_, i) => `ERR_${i}: 'message ${i}'`).join(", ") +
374+
" };";
375+
expect(line.length).toBeGreaterThan(400);
376+
expect(isMinifiedShape(line)).toBe(false);
377+
});
378+
});
379+
380+
// ---------------------------------------------------------------------------
381+
// Code-reference filter — integration via lookupError
382+
// ---------------------------------------------------------------------------
383+
384+
describe("lookupError code-ref over-fetch + filter + cap", () => {
385+
beforeEach(() => {
386+
// searchCode (called from lookupError) checks existsSync and shells
387+
// out to ripgrep via execFileSync — the integration test pipes a
388+
// synthetic rg stdout through the parser.
389+
mockExistsSync.mockReturnValue(true);
390+
mockExecFileSync.mockReset();
391+
});
392+
393+
it("survives a top-of-result-set full of tests/minified — keeps deeper real refs", () => {
394+
// Build a synthetic ripgrep stdout: 5 test files + 1 minified line +
395+
// 2 real source lines. Pre-filter slice to 5 (the old behaviour)
396+
// would have produced ZERO useful refs; over-fetch (RAW_CODE_LIMIT
397+
// = 20) plus filter must surface the two real ones.
398+
const minified = "deadbeef".repeat(75); // 600 hex chars
399+
const lines = [
400+
"/fake/repos/aztec-packages/yarn-project/foo.test.ts:1:test 1",
401+
"/fake/repos/aztec-packages/yarn-project/__tests__/bar.ts:2:test 2",
402+
"/fake/repos/aztec-packages/yarn-project/baz.spec.ts:3:test 3",
403+
"/fake/repos/aztec-packages/yarn-project/qux.e2e.ts:4:test 4",
404+
"/fake/repos/aztec-packages/yarn-project/mocks/zap.ts:5:test 5",
405+
`/fake/repos/aztec-packages/yarn-project/abi.ts:6:${minified}`,
406+
"/fake/repos/aztec-packages/yarn-project/real-one.ts:7:export const REAL = 1;",
407+
"/fake/repos/aztec-packages/yarn-project/real-two.ts:8:export function realTwo() {}",
408+
].join("\n");
409+
mockExecFileSync.mockReturnValue(lines);
410+
411+
// Use a query that won't strongly hit the static catalog so the
412+
// fallback (codeMatches) path is taken.
413+
const result = lookupError("xyzzy_unknown_query_for_test_only");
414+
415+
expect(result.codeMatches).toHaveLength(2);
416+
expect(result.codeMatches.map((m) => m.file)).toEqual([
417+
"aztec-packages/yarn-project/real-one.ts",
418+
"aztec-packages/yarn-project/real-two.ts",
419+
]);
420+
421+
// Lock in the over-fetch contract: the raw cap must be wide enough
422+
// for the filter to have headroom. searchCode passes
423+
// ``String(maxResults * 2)`` to rg via -m, so RAW_CODE_LIMIT (20)
424+
// surfaces as ``-m 40`` on the wire.
425+
const calls = mockExecFileSync.mock.calls;
426+
expect(calls.length).toBeGreaterThan(0);
427+
const args = calls[calls.length - 1][1] as string[];
428+
const mIdx = args.indexOf("-m");
429+
expect(mIdx).toBeGreaterThanOrEqual(0);
430+
expect(args[mIdx + 1]).toBe("40");
431+
});
432+
433+
it("caps to 2 even when many useful refs are returned", () => {
434+
const lines = Array.from({ length: 10 }, (_, i) =>
435+
`/fake/repos/aztec-packages/yarn-project/foo${i}.ts:${i}:export const F${i} = ${i};`,
436+
).join("\n");
437+
mockExecFileSync.mockReturnValue(lines);
438+
439+
const result = lookupError("xyzzy_unknown_query_for_test_only_2");
440+
441+
expect(result.codeMatches).toHaveLength(2);
442+
});
443+
});

0 commit comments

Comments
 (0)