Skip to content

Commit b624f83

Browse files
committed
fix(comses): language heuristic + tighter read default
Two bugs caught by a live end-to-end smoke test in Claude Code: 1. search_comses returned language=null for every result. Real COMSES search responses don't include releaseLanguages on nested releases — that field is only present on /releases/{v}/?format=json. Added a cheap text heuristic (scan title + description + tags for "netlogo", "python", "mesa", etc.) as a fallback. The heuristic is only a hint: it returns None when no keyword is present, and the authoritative language comes from get_comses_model. 2. The read_comses_files default max_total_bytes=200_000 blew past the MCP harness's per-response token budget on a single 57 KB .nlogo file (JSON-escaped strings balloon by ~25%). Dropped the default to 50_000, with a docstring note that larger pulls should pass the value explicitly. Two new tests: a direct unit test of _language_hint_from_text, and an integration test proving search_comses still populates language when releaseLanguages is absent (the real-API shape). 87 tests pass; lint + mypy clean.
1 parent 25a12b7 commit b624f83

2 files changed

Lines changed: 95 additions & 9 deletions

File tree

src/netlogo_mcp/tools.py

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -478,8 +478,12 @@ def _compact_search_result(entry: dict) -> dict:
478478
if name:
479479
authors.append(name)
480480

481-
# Language comes from the latest release's releaseLanguages, if present.
481+
# Language: search results don't include releaseLanguages, so we fall back
482+
# to a text heuristic over title + description + tags. This is surfaced as
483+
# a hint only — the authoritative language lives on get_comses_model.
482484
language = _language_from_releases(entry.get("releases") or [])
485+
if not language:
486+
language = _language_hint_from_text(entry)
483487

484488
return {
485489
"identifier": entry.get("identifier"),
@@ -501,10 +505,16 @@ def _compact_search_result(entry: dict) -> dict:
501505

502506

503507
def _language_from_releases(releases: list) -> str | None:
504-
"""Pick a language name from the release with the latest version, if any."""
508+
"""Pick a language name from a release detail, if the field is populated.
509+
510+
Real COMSES search results do NOT include `releaseLanguages` on nested
511+
releases — that data is only on `/releases/{version}/?format=json`.
512+
This helper still handles the full shape for detail responses and
513+
mocked tests; callers that only have search results should also try
514+
`_language_hint_from_text`.
515+
"""
505516
if not releases:
506517
return None
507-
# Prefer the release flagged as latest, else the last one.
508518
target = None
509519
for rel in releases:
510520
if (rel or {}).get("latestVersion"):
@@ -518,14 +528,44 @@ def _language_from_releases(releases: list) -> str | None:
518528
name = pl.get("name") or (lang or {}).get("name")
519529
if name:
520530
return str(name)
521-
# Fallback: programmingLanguageTags[] or platforms[]
522531
tags = (target or {}).get("programmingLanguageTags") or []
523532
if tags:
524533
first = tags[0]
525534
return first.get("name") if isinstance(first, dict) else str(first)
526535
return None
527536

528537

538+
# Keyword → display name, scanned case-insensitively against title /
539+
# description / tags. Keep short and ambiguous keywords out of this
540+
# list so we don't mis-tag ecology models as "R".
541+
_LANGUAGE_TEXT_HINTS: tuple[tuple[str, str], ...] = (
542+
("netlogo", "NetLogo"),
543+
("mesa", "Python"),
544+
("repast", "Repast"),
545+
("python", "Python"),
546+
("julia", "Julia"),
547+
("matlab", "MATLAB"),
548+
("gama platform", "GAMA"),
549+
("gama-platform", "GAMA"),
550+
)
551+
552+
553+
def _language_hint_from_text(entry: dict) -> str | None:
554+
"""Cheap heuristic: scan title + description + tags for a known language."""
555+
parts: list[str] = [
556+
str(entry.get("title") or ""),
557+
str(entry.get("summarizedDescription") or ""),
558+
str(entry.get("description") or ""),
559+
]
560+
for t in entry.get("tags") or []:
561+
parts.append(str(t.get("name") if isinstance(t, dict) else t))
562+
haystack = " ".join(parts).lower()
563+
for needle, label in _LANGUAGE_TEXT_HINTS:
564+
if needle in haystack:
565+
return label
566+
return None
567+
568+
529569
@mcp.tool()
530570
async def search_comses(
531571
ctx: Context,
@@ -890,7 +930,7 @@ async def read_comses_files(
890930
identifier: str,
891931
version: str = "latest",
892932
extensions: list[str] | None = None,
893-
max_total_bytes: int = 200_000,
933+
max_total_bytes: int = 50_000,
894934
) -> str:
895935
"""Return text contents of source and documentation files from a
896936
downloaded COMSES model.
@@ -912,10 +952,11 @@ async def read_comses_files(
912952
a string (may contain replacement characters for non-text bytes).
913953
- Files are included in priority order: ODD docs → NetLogo source →
914954
other code → other .md/.txt → everything else matching extensions.
915-
- Total body is capped at `max_total_bytes` (default 200 KB). When the
916-
cap is hit mid-file, that file is truncated at a line boundary;
917-
subsequent files are listed in `omitted_files` with reason
918-
`byte_cap_reached`.
955+
- Total body is capped at `max_total_bytes` (default 50 KB — sized to
956+
fit in a single conversational-LLM tool response). When the cap is
957+
hit mid-file, that file is truncated at a line boundary; subsequent
958+
files are listed in `omitted_files` with reason `byte_cap_reached`.
959+
For larger pulls, pass a higher value explicitly.
919960
- Files matching no `extensions` filter are listed in `omitted_files`
920961
with reason `extension_not_in_filter`.
921962

tests/test_comses.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,3 +1004,48 @@ def test_explore_comses_prompt_has_required_rules():
10041004
# Stop-and-ask fallback and no auto-translation.
10051005
assert "Stop-and-ask" in body or "stop-and-ask" in body.lower()
10061006
assert "Do NOT auto-translate" in body or "not auto-translate" in body.lower()
1007+
1008+
1009+
# ── Language hint heuristic (real COMSES search results omit releaseLanguages)
1010+
1011+
1012+
def test_language_hint_from_title_and_description():
1013+
from netlogo_mcp.tools import _language_hint_from_text
1014+
1015+
assert _language_hint_from_text({"title": "Wolf Sheep Netlogo Model"}) == "NetLogo"
1016+
assert (
1017+
_language_hint_from_text({"description": "Implemented in Python using Mesa."})
1018+
== "Python"
1019+
)
1020+
assert _language_hint_from_text({"tags": [{"name": "Repast"}]}) == "Repast"
1021+
assert _language_hint_from_text({"title": "ecology of wolves"}) is None
1022+
1023+
1024+
@pytest.mark.asyncio
1025+
async def test_search_comses_falls_back_to_heuristic_when_release_langs_absent(
1026+
monkeypatch,
1027+
):
1028+
"""Real COMSES search results don't include releaseLanguages.
1029+
1030+
The compact response must still get a language when the text mentions one.
1031+
"""
1032+
from netlogo_mcp import tools
1033+
1034+
stripped = json.loads(json.dumps(_fx("search_result.json")))
1035+
# Strip releaseLanguages to simulate real API.
1036+
for r in stripped["results"]:
1037+
for rel in r.get("releases") or []:
1038+
rel["releaseLanguages"] = []
1039+
1040+
def handler(request: httpx.Request) -> httpx.Response:
1041+
return httpx.Response(200, json=stripped)
1042+
1043+
_patch_client_factory(monkeypatch, handler)
1044+
1045+
ctx = MagicMock()
1046+
raw = await tools.search_comses(ctx, query="x", page=1)
1047+
data = json.loads(raw)
1048+
# "Wolf Sheep Predation" + "NetLogo" tag → picked up via heuristic.
1049+
assert data["results"][0]["language"] == "NetLogo"
1050+
# Second result has "Python" in tags + description.
1051+
assert data["results"][1]["language"] == "Python"

0 commit comments

Comments
 (0)