Skip to content

Commit 9e05b4b

Browse files
DvirDukhanCopilot
andcommitted
fix(mcp): address PR #701 review comments + green CI
structural.py: - search_code: relevance floor — drop files with zero raw lexical signal (name-exact + path-overlap + raw BM25) so a nonsense query returns [] instead of centrality-ranked noise. - _hybrid_components: deterministic representative-symbol selection via a stable sort key (exact query-id match, then src_start, name, src_end) — no longer depends on FalkorDB row order; fixes nondeterministic snippet/name. - find_path: keep the node label (a path is bare nodes with no per-hop relation, so the label is the only type signal); restores the edge-leak guard. - get_file_neighbors: include file_id in both early-return shapes for a consistent schema. tests: - _find_id resolves a symbol id directly from the graph (search_code is file-oriented and no longer returns per-symbol ids) in test_query_tools.py and test_impact_analysis.py, with a uniqueness assertion. - search_code tests use the file-oriented query= API. 53/53 mcp tests pass against live FalkorDB; ruff clean. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent d909649 commit 9e05b4b

3 files changed

Lines changed: 105 additions & 48 deletions

File tree

api/mcp/tools/structural.py

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,22 @@ def _minmax(d: dict[str, float]) -> dict[str, float]:
100100
return {k: (v - lo) / (hi - lo) for k, v in d.items()}
101101

102102

103+
def _rep_key(d: dict[str, Any]) -> tuple:
104+
"""Stable ordering key for a file's representative-symbol candidate.
105+
106+
Lower is better: exact query-id match first, then lowest ``src_start``,
107+
then ``name`` then ``src_end`` as deterministic tie-breakers so the chosen
108+
representative never depends on FalkorDB row order (even when ``src_start``
109+
ties or is missing).
110+
"""
111+
return (
112+
0 if d.get("exact") else 1,
113+
d["src_start"] if d.get("src_start") is not None else math.inf,
114+
d.get("name") or "",
115+
d["src_end"] if d.get("src_end") is not None else math.inf,
116+
)
117+
118+
103119
def _bm25(query_tokens: set[str], files: list[str],
104120
tokmap: dict[str, list[str]]) -> dict[str, float]:
105121
docs = [tokmap.get(f, []) for f in files]
@@ -141,7 +157,11 @@ async def _hybrid_rank(g, query: str, project: Optional[str]) -> list[dict[str,
141157
symbol per file, used for the snippet). Pure read; no graph mutation.
142158
"""
143159
files, comps, rep, abs_of, file_id_of = await _hybrid_components(g, query, project)
144-
return _hybrid_score(files, comps, rep, abs_of, file_id_of)
160+
scored = _hybrid_score(files, comps, rep, abs_of, file_id_of)
161+
# Relevance floor: a query with no lexical overlap (e.g. a nonsense token)
162+
# would otherwise be ranked purely by query-independent centrality and
163+
# return noise. Drop files with no lexical signal so such queries yield [].
164+
return [r for r in scored if r.get("lex", 0.0) > 0]
145165

146166

147167
async def _hybrid_components(g, query: str, project: Optional[str]):
@@ -194,14 +214,19 @@ def rel(p: Optional[str]) -> str:
194214
continue
195215
if name:
196216
bodytok[rp].extend(_subtokens(name))
197-
if name.lower() in qids:
217+
is_exact = name.lower() in qids
218+
if is_exact:
198219
name_exact[rp] += 1.0
199-
rep.setdefault(rp, {"name": name, "src_start": start, "src_end": end})
200-
if rp not in rep and name:
220+
# Representative symbol for the file's snippet: prefer one whose name
221+
# exactly matches a query identifier, otherwise the lowest-``src_start``
222+
# symbol. Fully deterministic regardless of result-set order via a
223+
# stable sort key (exact first, then src_start, then name, then
224+
# src_end) so ties / missing ``src_start`` never depend on row order.
225+
cand = {"name": name, "src_start": start, "src_end": end,
226+
"exact": is_exact}
201227
cur = rep.get(rp)
202-
if cur is None or (start is not None and (
203-
cur.get("src_start") is None or start < cur["src_start"])):
204-
rep[rp] = {"name": name, "src_start": start, "src_end": end}
228+
if cur is None or _rep_key(cand) < _rep_key(cur):
229+
rep[rp] = cand
205230
if doc and body_used[rp] < _HYBRID_BODY_TOKEN_CAP:
206231
toks = _tokenize(doc)[: _HYBRID_BODY_TOKEN_CAP - body_used[rp]]
207232
bodytok[rp].extend(toks)
@@ -221,9 +246,10 @@ def rel(p: Optional[str]) -> str:
221246
return [], {}, {}, {}, {}
222247

223248
path_overlap = {f: float(len(qtok & set(pathtok.get(f, [])))) for f in files}
249+
raw_bm25 = _bm25(qtok, files, bodytok)
224250
n_name = _minmax(name_exact if name_exact else {f: 0.0 for f in files})
225251
n_path = _minmax(path_overlap)
226-
n_bm25 = _minmax(_bm25(qtok, files, bodytok))
252+
n_bm25 = _minmax(raw_bm25)
227253
n_cent = _minmax({f: centrality.get(f, 0.0) for f in files})
228254

229255
comps: dict[str, dict[str, float]] = {}
@@ -234,6 +260,13 @@ def rel(p: Optional[str]) -> str:
234260
"bm25": n_bm25.get(f, 0.0),
235261
"cent": n_cent.get(f, 0.0),
236262
"pen": _HYBRID_W_PEN if _PENALTY_RE.search(f) else 0.0,
263+
# Raw (un-normalized) query-dependent signal. A file with zero
264+
# lexical overlap (name/path/body) is not relevant to the query —
265+
# only query-independent centrality could rank it — so search_code
266+
# drops it rather than returning noise for an unmatched query.
267+
"lex": (name_exact.get(f, 0.0)
268+
+ path_overlap.get(f, 0.0)
269+
+ raw_bm25.get(f, 0.0)),
237270
}
238271

239272
return files, comps, rep, abs_of, file_id_of
@@ -271,6 +304,7 @@ def _hybrid_score(
271304
"name": r.get("name"),
272305
"src_start": r.get("src_start"),
273306
"src_end": r.get("src_end"),
307+
"lex": c.get("lex", 0.0),
274308
})
275309
scored.sort(key=lambda d: -d["score"])
276310
return scored
@@ -575,9 +609,10 @@ def _node_summary(
575609
``encode_node`` returns ``{id, labels, properties: {...}}`` because Node
576610
properties live on a nested attribute. Agents want a flat record. We keep
577611
the single meaningful ``label`` (File, Class, Function — not the fulltext
578-
marker ``Searchable``) only for ``search_code``, where File-vs-Function
579-
disambiguation matters; neighbor/path results omit it via ``with_label``
580-
since the relation already implies the type.
612+
marker ``Searchable``) for ``search_code`` (File-vs-Function disambiguation)
613+
and for ``find_path`` (a path is bare nodes with no per-hop relation, so the
614+
label is the only type signal). Single-hop neighbor results omit it via
615+
``with_label`` since the relation already implies the node type.
581616
582617
When ``rel_to`` is given (the project/worktree identifier), ``file`` is
583618
relativized to drop the absolute worktree prefix.
@@ -893,7 +928,7 @@ async def find_path(
893928
paths: list[dict[str, Any]] = []
894929
for entry in raw:
895930
node_seq = [
896-
_node_summary(x, rel_to=project, with_label=False)
931+
_node_summary(x, rel_to=project, with_label=True)
897932
for x in entry
898933
# Discriminate on ``labels``: ``encode_node`` emits a top-level
899934
# ``labels`` key, while ``encode_edge`` does not (edges carry
@@ -1036,6 +1071,7 @@ async def get_file_neighbors(
10361071
if abs_path is None:
10371072
return {
10381073
"file": _relativize(str(file), project),
1074+
"file_id": None,
10391075
"total_neighbors": 0,
10401076
"truncated": False,
10411077
"neighbors": [],
@@ -1051,6 +1087,7 @@ async def get_file_neighbors(
10511087
if not ids:
10521088
return {
10531089
"file": _relativize(abs_path, project),
1090+
"file_id": fid,
10541091
"total_neighbors": 0,
10551092
"truncated": False,
10561093
"neighbors": [],

tests/mcp/test_impact_analysis.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,17 +69,25 @@ async def test_impact_analysis_registered_via_app():
6969

7070

7171
async def _find_id(indexed_fixture, name: str) -> int:
72-
from api.mcp.tools.structural import search_code
72+
"""Resolve a symbol name to its int node id directly from the graph.
7373
74-
rows = await search_code(
75-
prefix=name,
76-
project=indexed_fixture.project,
77-
branch=indexed_fixture.branch,
78-
)
79-
for r in rows:
80-
if r["name"] == name:
81-
return r["id"]
82-
raise AssertionError(f"symbol {name!r} not found")
74+
``search_code`` is file-oriented and no longer returns per-symbol ids.
75+
"""
76+
from api.mcp.tools.structural import _project_arg
77+
78+
g = _project_arg(indexed_fixture.project, indexed_fixture.branch)
79+
try:
80+
res = await g._query(
81+
"MATCH (n) WHERE (n:Function OR n:Class) AND n.name = $name "
82+
"RETURN ID(n)",
83+
{"name": name},
84+
)
85+
finally:
86+
await g.close()
87+
rows = res.result_set
88+
assert rows, f"symbol {name!r} not found in graph"
89+
assert len(rows) == 1, f"ambiguous symbol {name!r}: {len(rows)} matches"
90+
return rows[0][0]
8391

8492

8593
async def test_impact_upstream_of_db(indexed_fixture, expected_contract):

tests/mcp/test_query_tools.py

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,23 @@ def anyio_backend() -> str:
2828
async def test_search_code_finds_entrypoint(indexed_fixture, expected_contract):
2929
from api.mcp.tools.structural import search_code
3030

31+
# search_code is file-oriented: a free-text query naming a symbol must
32+
# surface the file that defines it.
33+
symbol = expected_contract["search_prefixes"]["ent"]["must_include"][0]
3134
results = await search_code(
32-
prefix="ent",
35+
query=symbol,
3336
project=indexed_fixture.project,
3437
branch=indexed_fixture.branch,
3538
)
36-
names = {r["name"] for r in results}
37-
for required in expected_contract["search_prefixes"]["ent"]["must_include"]:
38-
assert required in names, f"expected {required} in {names}"
39+
files = [r["file"] for r in results if r.get("file")]
40+
assert any(f.endswith(f"{symbol}.py") for f in files), files
3941

4042

4143
async def test_search_code_honors_limit(indexed_fixture):
4244
from api.mcp.tools.structural import search_code
4345

4446
results = await search_code(
45-
prefix="r", # broad prefix
47+
query="entrypoint service repo db", # broad: matches several files
4648
project=indexed_fixture.project,
4749
branch=indexed_fixture.branch,
4850
limit=1,
@@ -54,7 +56,7 @@ async def test_search_code_empty_for_nonsense(indexed_fixture):
5456
from api.mcp.tools.structural import search_code
5557

5658
results = await search_code(
57-
prefix="zzz_no_such_symbol_zzz",
59+
query="zzz_no_such_symbol_zzz",
5860
project=indexed_fixture.project,
5961
branch=indexed_fixture.branch,
6062
)
@@ -65,7 +67,7 @@ async def test_search_code_result_serialisable(indexed_fixture):
6567
from api.mcp.tools.structural import search_code
6668

6769
results = await search_code(
68-
prefix="serv",
70+
query="service",
6971
project=indexed_fixture.project,
7072
branch=indexed_fixture.branch,
7173
)
@@ -101,7 +103,7 @@ async def test_search_code_returns_relative_paths(indexed_fixture):
101103
from api.mcp.tools.structural import search_code
102104

103105
results = await search_code(
104-
prefix="ent",
106+
query="entrypoint",
105107
project=indexed_fixture.project,
106108
branch=indexed_fixture.branch,
107109
)
@@ -112,20 +114,20 @@ async def test_search_code_returns_relative_paths(indexed_fixture):
112114

113115

114116
async def test_search_code_ranks_exact_match_within_limit(indexed_fixture, expected_contract):
115-
"""An exact name==prefix match must survive the ``[:limit]`` cut and rank
116-
ahead of the looser prefix matches."""
117+
"""A query naming a symbol must surface that symbol's file as the top hit,
118+
with the matching symbol as the file's representative."""
117119
from api.mcp.tools.structural import search_code
118120

119-
# pick a known symbol from the contract and query its exact name
120-
exact = next(iter(expected_contract["search_prefixes"]["ent"]["must_include"]))
121+
symbol = next(iter(expected_contract["search_prefixes"]["ent"]["must_include"]))
121122
results = await search_code(
122-
prefix=exact,
123+
query=symbol,
123124
project=indexed_fixture.project,
124125
branch=indexed_fixture.branch,
125126
limit=1,
126127
)
127-
assert results, f"no results for exact prefix {exact!r}"
128-
assert results[0]["name"] == exact
128+
assert results, f"no results for query {symbol!r}"
129+
assert results[0]["file"].endswith(f"{symbol}.py")
130+
assert results[0]["name"] == symbol
129131

130132

131133
# ---------------------------------------------------------------------------
@@ -134,18 +136,28 @@ async def test_search_code_ranks_exact_match_within_limit(indexed_fixture, expec
134136

135137

136138
async def _find_id(indexed_fixture, name: str) -> int:
137-
"""Helper: resolve a symbol name to its int node id via search_code."""
138-
from api.mcp.tools.structural import search_code
139+
"""Resolve a symbol name to its int node id directly from the graph.
139140
140-
rows = await search_code(
141-
prefix=name,
142-
project=indexed_fixture.project,
143-
branch=indexed_fixture.branch,
144-
)
145-
for r in rows:
146-
if r["name"] == name:
147-
return r["id"]
148-
raise AssertionError(f"symbol {name!r} not found via search_code")
141+
``search_code`` is file-oriented and no longer returns per-symbol ids, so
142+
the neighbor/path/impact tests resolve the id straight from FalkorDB. The
143+
names used here (entrypoint/service/db) are unique Functions in the fixture,
144+
so a uniqueness assertion guards against silently picking the wrong node.
145+
"""
146+
from api.mcp.tools.structural import _project_arg
147+
148+
g = _project_arg(indexed_fixture.project, indexed_fixture.branch)
149+
try:
150+
res = await g._query(
151+
"MATCH (n) WHERE (n:Function OR n:Class) AND n.name = $name "
152+
"RETURN ID(n)",
153+
{"name": name},
154+
)
155+
finally:
156+
await g.close()
157+
rows = res.result_set
158+
assert rows, f"symbol {name!r} not found in graph"
159+
assert len(rows) == 1, f"ambiguous symbol {name!r}: {len(rows)} matches"
160+
return rows[0][0]
149161

150162

151163
async def test_get_callees_of_entrypoint(indexed_fixture, expected_contract):

0 commit comments

Comments
 (0)