@@ -292,6 +292,152 @@ describe("lookupAztecError — semantic fallback", () => {
292292 } ) ;
293293} ) ;
294294
describe("lookupAztecError — content-thin chunk filter", () => {
  /**
   * Defense-in-depth filter: even if docsgpt's `/api/search` regresses
   * and starts returning path-only / empty-body apiref chunks,
   * `isUsefulSemanticChunk` drops them before they're surfaced to the
   * LLM consumer. Mirrors the server-side
   * `_is_empty_apiref_chunk` helper.
   */

  /**
   * Builds a search-result fixture for the mocked `client.search`.
   *
   * @param text - Chunk body as returned by the search endpoint.
   * @param source - Value for the chunk's `source` field; defaults to a
   *   repo-relative path.
   * @returns An object with `text`, a fixed `title` of "foo.nr", and `source`.
   */
  function chunk(text: string, source = "aztec-nr/aztec/src/foo.nr") {
    return { text, title: "foo.nr", source };
  }

  it("drops chunks with `#`-prefixed path heading even when source field is a public URL", async () => {
    /**
     * Regression for codex review: `/api/search` rewrites the chunk's
     * `source` field to a public URL via `_aztec_source_url`. A chunk
     * whose body is `# aztec-nr/.../foo.nr` (path heading only) won't
     * match the URL-rewritten source field by string equality. The
     * earlier filter would fail to strip the heading, then fall through
     * to the path-shape check — which also failed because `# ...` has
     * whitespace from the markdown marker. The new shape-only filter
     * catches this directly.
     */
    const client = makeClient({
      search: vi.fn().mockResolvedValue([
        chunk(
          "# aztec-nr/aztec/src/context/foo.nr\n",
          "https://github.com/AztecProtocol/aztec-packages/blob/v4.2.0/noir-projects/aztec-nr/aztec/src/context/foo.nr"
        ),
      ]),
    });

    const result = await lookupAztecError({ query: "obscure" }, client);

    expect(result.semanticHealth).toBe("no_results");
    // Same contract the sibling all-path-only test pins: when every chunk
    // is filtered out, no semantic results are exposed at all.
    expect(result.semanticResults).toBeUndefined();
  });

  it("treats raw output of all path-only chunks as 'no_results'", async () => {
    const client = makeClient({
      search: vi.fn().mockResolvedValue([
        chunk(
          "\n\naztec-nr/aztec/src/context/note_existence_request.nr\n\n",
          "aztec-nr/aztec/src/context/note_existence_request.nr"
        ),
        chunk(
          "\n\naztec-nr/aztec/src/note/hinted_note.nr\n",
          "aztec-nr/aztec/src/note/hinted_note.nr"
        ),
      ]),
    });

    const result = await lookupAztecError({ query: "obscure" }, client);

    expect(result.semanticHealth).toBe("no_results");
    expect(result.semanticResults).toBeUndefined();
  });

  it("keeps mixed results when at least one chunk has substantive body", async () => {
    const client = makeClient({
      search: vi.fn().mockResolvedValue([
        chunk("\n\naztec-nr/aztec/src/empty.nr\n", "aztec-nr/aztec/src/empty.nr"),
        // The only chunk with real content after its path heading.
        chunk(
          "# aztec-nr/aztec/src/hash.nr\npub fn poseidon(input: [Field; N]) -> Field",
          "aztec-nr/aztec/src/hash.nr"
        ),
        chunk("\n\naztec-nr/aztec/src/utils.nr\n", "aztec-nr/aztec/src/utils.nr"),
      ]),
    });

    const result = await lookupAztecError({ query: "poseidon" }, client);

    expect(result.semanticHealth).toBe("ok");
    expect(result.semanticResults).toHaveLength(1);
    // Guarded access instead of a non-null assertion: if the length check
    // above ever regresses, this fails as an assertion, not a TypeError.
    const [survivingChunk] = result.semanticResults ?? [];
    expect(survivingChunk?.text).toContain("poseidon");
  });
});
364+
describe("lookupAztecError — weak catalog suppression when semantic is useful", () => {
  /**
   * The user-reported "bogus result still appears" failure mode: weak
   * catalog hits visible alongside semantic results let the LLM
   * consumer anchor on the wrong answer. When semantic returned
   * useful (post-filter) chunks, the weak catalog is now suppressed
   * from the rendered output entirely (still present in
   * `result.result.catalogMatches` for programmatic consumers).
   *
   * This tests the data-shape that the formatter consumes; the
   * formatter test (`tests/utils/format.test.ts`) verifies the
   * suppression actually happens at render time.
   */
  const query = "note already nullified";

  /**
   * Primes the mocked catalog lookup with a single hit built by
   * `catalogHit(54, "Contract already initialized", "word-overlap")` and no
   * code matches. The tests below treat this as the "weak" catalog result;
   * the first argument is presumably the score (the test asserts
   * `score === 54`) — confirm against `catalogHit`'s definition.
   */
  function mockWeakCatalogHit() {
    mockLookupError.mockReturnValue({
      query,
      catalogMatches: [
        catalogHit(54, "Contract already initialized", "word-overlap"),
      ],
      codeMatches: [],
    });
  }

  it("returns semanticHealth='ok' with weak catalog still in result.catalogMatches", async () => {
    mockWeakCatalogHit();

    const client = makeClient({
      search: vi.fn().mockResolvedValue([
        {
          text: "Notes in Aztec are nullified by emitting a nullifier...",
          title: "Note Lifecycle",
          source: "docs/notes.md",
        },
      ]),
    });

    const result = await lookupAztecError({ query }, client);

    expect(result.semanticHealth).toBe("ok");
    expect(result.semanticResults).toHaveLength(1);
    // The weak catalog hit is preserved in the data — the formatter
    // is responsible for hiding it. Programmatic consumers can still
    // see all signals.
    expect(result.result.catalogMatches).toHaveLength(1);
    expect(result.result.catalogMatches[0].score).toBe(54);
  });

  it("when semantic is filtered out (all path-only) AND catalog is weak, keeps catalog", async () => {
    mockWeakCatalogHit();

    const client = makeClient({
      search: vi.fn().mockResolvedValue([
        // Path-only chunks that the filter will drop
        {
          text: "\n\naztec-nr/aztec/src/foo.nr\n",
          title: "foo.nr",
          source: "aztec-nr/aztec/src/foo.nr",
        },
      ]),
    });

    const result = await lookupAztecError({ query }, client);

    // semantic returned empty (after filter) → no_results
    expect(result.semanticHealth).toBe("no_results");
    // Weak catalog stays in the result so the user has *some* signal
    expect(result.result.catalogMatches).toHaveLength(1);
    expect(result.message).toContain("low-confidence");
    // Either phrasing is accepted, matched case-insensitively.
    expect(result.message).toMatch(/no relevant documentation|Semantic search/i);
  });
});
440+
295441describe ( "lookupAztecError — semantic failure (sanitized message)" , ( ) => {
296442 it ( "sets semanticHealth='failed' and returns sanitized message on 401" , async ( ) => {
297443 const client = makeClient ( {
@@ -325,7 +471,7 @@ describe("lookupAztecError — version-mismatch gate", () => {
325471 it ( "blocks semantic fallback when local clone diverges from corpus" , async ( ) => {
326472 mockGetRepoTag . mockResolvedValue ( "v4.1.0" ) ;
327473 const client = makeClient ( {
328- search : vi . fn ( ) . mockResolvedValue ( [ { text : "x " , title : "x " , source : "x" } ] ) ,
474+ search : vi . fn ( ) . mockResolvedValue ( [ { text : "Some prose body content here. " , title : "T " , source : "x" } ] ) ,
329475 getCorpusVersion : vi . fn ( ) . mockResolvedValue ( { aztec_corpus_version : "v4.2.0" } ) ,
330476 } ) ;
331477
@@ -340,7 +486,7 @@ describe("lookupAztecError — version-mismatch gate", () => {
340486 mockGetRepoTag . mockResolvedValue ( "v4.1.0" ) ;
341487 const client = makeClient ( {
342488 search : vi . fn ( ) . mockResolvedValue ( [
343- { text : "x " , title : "x " , source : "x" } ,
489+ { text : "Some prose body content here. " , title : "T " , source : "x" } ,
344490 ] ) ,
345491 getCorpusVersion : vi . fn ( ) . mockResolvedValue ( { aztec_corpus_version : "v4.2.0" } ) ,
346492 } ) ;
0 commit comments