Skip to content

Commit afdc9a3

Browse files
mios-dev and claude committed
harden(web-tools): OpenAI url_citation annotations + relevance-filter all source paths
Web-tools test (live): web_search/crawl/web_scrape all return CURRENT data (Fedora 44) but the answer cited off-topic junk (a Fedora answer citing "Shaolin monks"). Researched OpenAI's current web-tool conventions and conformed the citation surface: - _filter_relevant_sources(): OpenAI grounding rule "include only sources that support the response; irrelevant sources permanently degrade trust." Keeps a source only if its title shares a >=4-char word with the answer/query or its domain stem appears there; DEGRADE-OPEN (never strips to empty). Applied at ALL five source-attach paths (native-loop, polish/proxy, DAG streaming + non-stream, native streaming) so no path leaks off-topic citations. - _sources_annotations(): emits OpenAI's canonical url_citation annotations ({type,url,title,start_index,end_index} with real char offsets into the answer) on the non-streaming message, alongside the existing mios_sources + Sources md. Live-verified: "current Fedora release" -> "Fedora Workstation 44 (April 28 2026)", 5 url_citation annotations, all sources Fedora-relevant (junk gone). web-tools hardening 2026-06-21. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 532cf7c commit afdc9a3

1 file changed

Lines changed: 76 additions & 2 deletions

File tree

usr/lib/mios/agent-pipe/server.py

Lines changed: 76 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8384,6 +8384,60 @@ def _sources_metadata(refs: list) -> list:
83848384
return [{"n": i + 1, "title": _t, "url": _u} for i, (_t, _u) in enumerate(refs)]
83858385

83868386

8387+
def _sources_annotations(refs: list, text: str) -> list:
8388+
"""OpenAI url_citation annotations (Chat/Responses parity): one
8389+
{type:'url_citation', url, title, start_index, end_index} per cited source.
8390+
start/end are char offsets into `text` where the URL appears inline (so a UI
8391+
renders a clickable cite); 0/0 when the source is a turn-source not inlined.
8392+
This is OpenAI's canonical citation contract -- attaching it lets MiOS clients
8393+
render web citations the same way ChatGPT does. web-tools hardening 2026-06-21."""
8394+
out: list = []
8395+
_txt = text or ""
8396+
for _ref in (refs or []):
8397+
try:
8398+
_t, _u = _ref
8399+
except (ValueError, TypeError):
8400+
continue
8401+
if not _u:
8402+
continue
8403+
_i = _txt.find(_u)
8404+
out.append({
8405+
"type": "url_citation",
8406+
"url": _u,
8407+
"title": (_t or _u),
8408+
"start_index": (_i if _i >= 0 else 0),
8409+
"end_index": (_i + len(_u) if _i >= 0 else 0),
8410+
})
8411+
return out
8412+
8413+
8414+
def _filter_relevant_sources(refs: list, *texts: str) -> list:
8415+
"""OpenAI grounding rule: 'include only search results/citations that support
8416+
the cited response text -- irrelevant sources permanently degrade user trust.'
8417+
Keep a source only when its title shares a content word (>=4 chars) with the
8418+
answer/query, OR its registrable-domain stem appears in them. DEGRADE-OPEN: if
8419+
the filter would drop EVERYTHING (the answer echoed no source token), return the
8420+
originals -- never strip citations to empty. Kills the off-topic-source bleed
8421+
(a Fedora answer citing 'Shaolin monks'). web-tools hardening 2026-06-21."""
8422+
if not refs:
8423+
return refs
8424+
_blob = " ".join(t for t in texts if t).lower()
8425+
if not _blob:
8426+
return refs
8427+
_kept: list = []
8428+
for _ref in refs:
8429+
try:
8430+
_t, _u = _ref
8431+
except (ValueError, TypeError):
8432+
continue
8433+
_dom = re.sub(r"^https?://(www\.)?", "", str(_u or "")).split("/")[0].lower()
8434+
_stem = _dom.split(".")[0] if _dom else ""
8435+
_words = set(re.findall(r"[a-z0-9]{4,}", str(_t or "").lower()))
8436+
if (_stem and len(_stem) >= 4 and _stem in _blob) or any(_w in _blob for _w in _words):
8437+
_kept.append((_t, _u))
8438+
return _kept if _kept else refs
8439+
8440+
83878441
# Parse a sub-agent's appended '**Sources:**\nN. title — url' block (or any bare
83888442
# http URLs) back into citable items. A council/DAG facet is dispatched to a leaf
83898443
# agent (hermes/opencode) that re-calls :8640 WITHOUT the turn-id header, so its
@@ -17221,6 +17275,7 @@ async def _run_ground() -> None:
1722117275
except Exception: # noqa: BLE001
1722217276
pass
1722317277
_dag_refs = _src_collected()
17278+
_dag_refs = _filter_relevant_sources(_dag_refs, main)
1722417279
if _dag_refs and "**Sources:**" not in main:
1722517280
main = main.rstrip() + _sources_markdown(_dag_refs)
1722617281
yield _sse_reasoning(envelope + "\n", chat_id=chat_id, model=model)
@@ -17256,13 +17311,17 @@ async def _run_ground() -> None:
1725617311
except Exception: # noqa: BLE001
1725717312
pass
1725817313
_dag_refs = _src_collected()
17314+
# OpenAI grounding: drop off-topic sources before citing. web-tools hardening.
17315+
_dag_refs = _filter_relevant_sources(_dag_refs, main)
1725917316
if _dag_refs and "**Sources:**" not in main:
1726017317
main = main.rstrip() + _sources_markdown(_dag_refs)
1726117318
return JSONResponse(content={
1726217319
"id": chat_id, "object": "chat.completion",
1726317320
"created": int(time.time()), "model": model,
1726417321
"choices": [{"index": 0,
17265-
"message": {"role": "assistant", "content": main},
17322+
"message": {"role": "assistant", "content": main,
17323+
# OpenAI url_citation annotations.
17324+
"annotations": _sources_annotations(_dag_refs, main)},
1726617325
"finish_reason": "stop"}],
1726717326
"usage": _usage_estimate(last_user_text, main), # P4 /v1 conformance
1726817327
"mios_sources": _sources_metadata(_dag_refs) if _dag_refs else [],
@@ -26332,6 +26391,10 @@ async def _work() -> None:
2633226391
except Exception: # noqa: BLE001
2633326392
pass
2633426393
_refs = _src_collected() or _refs
26394+
# OpenAI grounding: keep ONLY sources that support the answer -- drop the
26395+
# off-topic bleed (a Fedora answer must not cite 'Shaolin monks') before any
26396+
# citation surface. web-tools hardening 2026-06-21.
26397+
_refs = _filter_relevant_sources(_refs, _ans, last_user_text)
2633526398
if _refs and _ans and _ans.strip() and "**Sources:**" not in _ans:
2633626399
_append = _sources_markdown(_refs)
2633726400
if _append:
@@ -26360,7 +26423,10 @@ async def _stream_native() -> AsyncGenerator[bytes, None]:
2636026423
"id": chat_id, "object": "chat.completion",
2636126424
"created": int(time.time()), "model": model,
2636226425
"choices": [{"index": 0,
26363-
"message": {"role": "assistant", "content": _ans},
26426+
"message": {"role": "assistant", "content": _ans,
26427+
# OpenAI url_citation annotations (canonical
26428+
# citation contract). web-tools hardening 2026-06-21.
26429+
"annotations": _sources_annotations(_refs, _ans)},
2636426430
"finish_reason": "stop"}],
2636526431
"usage": _usage_estimate(last_user_text, _ans),
2636626432
"mios_sources": _sources_metadata(_refs) if _refs else [],
@@ -28922,6 +28988,7 @@ def _finished_sec_status() -> list:
2892228988
except Exception: # noqa: BLE001
2892328989
pass
2892428990
_stream_refs = _src_collected()
28991+
_stream_refs = _filter_relevant_sources(_stream_refs, wrapped)
2892528992
if _stream_refs and "**Sources:**" not in wrapped:
2892628993
wrapped = wrapped.rstrip() + _sources_markdown(_stream_refs)
2892728994
yield _sse_chunk("", chat_id=chat_id, model=model,
@@ -29159,10 +29226,17 @@ def _sec_body(_n, _c):
2915929226
except Exception: # noqa: BLE001
2916029227
pass
2916129228
_refs = _src_collected()
29229+
# OpenAI grounding: keep ONLY sources that support the answer
29230+
# (drop the off-topic bleed) before citing. web-tools hardening
29231+
# 2026-06-21.
29232+
_refs = _filter_relevant_sources(_refs, wrapped, last_user_text)
2916229233
if _refs and "**Sources:**" not in wrapped:
2916329234
wrapped = wrapped.rstrip() + _sources_markdown(_refs)
2916429235
if _refs:
2916529236
backend_json["mios_sources"] = _sources_metadata(_refs)
29237+
# OpenAI url_citation annotations -- the canonical citation
29238+
# contract so clients render clickable web cites.
29239+
msg["annotations"] = _sources_annotations(_refs, wrapped)
2916629240
msg["content"] = wrapped
2916729241
choices[0]["message"] = msg
2916829242
backend_json["choices"] = choices

0 commit comments

Comments
 (0)