@@ -8384,6 +8384,60 @@ def _sources_metadata(refs: list) -> list:
83848384 return [{"n": i + 1, "title": _t, "url": _u} for i, (_t, _u) in enumerate(refs)]
83858385
83868386
8387+ def _sources_annotations(refs: list, text: str) -> list:
8388+ """OpenAI url_citation annotations (Chat/Responses parity): one
8389+ {type:'url_citation', url, title, start_index, end_index} per cited source.
8390+ start/end are char offsets into `text` where the URL appears inline (so a UI
8391+ renders a clickable cite); 0/0 when the source is a turn-source not inlined.
8392+ This is OpenAI's canonical citation contract -- attaching it lets MiOS clients
8393+ render web citations the same way ChatGPT does. web-tools hardening 2026-06-21."""
8394+ out: list = []
8395+ _txt = text or ""
8396+ for _ref in (refs or []):
8397+ try:
8398+ _t, _u = _ref
8399+ except (ValueError, TypeError):
8400+ continue
8401+ if not _u:
8402+ continue
8403+ _i = _txt.find(_u)
8404+ out.append({
8405+ "type": "url_citation",
8406+ "url": _u,
8407+ "title": (_t or _u),
8408+ "start_index": (_i if _i >= 0 else 0),
8409+ "end_index": (_i + len(_u) if _i >= 0 else 0),
8410+ })
8411+ return out
8412+
8413+
8414+ def _filter_relevant_sources(refs: list, *texts: str) -> list:
8415+ """OpenAI grounding rule: 'include only search results/citations that support
8416+ the cited response text -- irrelevant sources permanently degrade user trust.'
8417+ Keep a source only when its title shares a content word (>=4 chars) with the
8418+ answer/query, OR its registrable-domain stem appears in them. DEGRADE-OPEN: if
8419+ the filter would drop EVERYTHING (the answer echoed no source token), return the
8420+ originals -- never strip citations to empty. Kills the off-topic-source bleed
8421+ (a Fedora answer citing 'Shaolin monks'). web-tools hardening 2026-06-21."""
8422+ if not refs:
8423+ return refs
8424+ _blob = " ".join(t for t in texts if t).lower()
8425+ if not _blob:
8426+ return refs
8427+ _kept: list = []
8428+ for _ref in refs:
8429+ try:
8430+ _t, _u = _ref
8431+ except (ValueError, TypeError):
8432+ continue
8433+ _dom = re.sub(r"^https?://(www\.)?", "", str(_u or "")).split("/")[0].lower()
8434+ _stem = _dom.split(".")[0] if _dom else ""
8435+ _words = set(re.findall(r"[a-z0-9]{4,}", str(_t or "").lower()))
8436+ if (_stem and len(_stem) >= 4 and _stem in _blob) or any(_w in _blob for _w in _words):
8437+ _kept.append((_t, _u))
8438+ return _kept if _kept else refs
8439+
8440+
83878441# Parse a sub-agent's appended '**Sources:**\nN. title — url' block (or any bare
83888442# http URLs) back into citable items. A council/DAG facet is dispatched to a leaf
83898443# agent (hermes/opencode) that re-calls :8640 WITHOUT the turn-id header, so its
@@ -17221,6 +17275,7 @@ async def _run_ground() -> None:
1722117275 except Exception: # noqa: BLE001
1722217276 pass
1722317277 _dag_refs = _src_collected()
17278+ _dag_refs = _filter_relevant_sources(_dag_refs, main)
1722417279 if _dag_refs and "**Sources:**" not in main:
1722517280 main = main.rstrip() + _sources_markdown(_dag_refs)
1722617281 yield _sse_reasoning(envelope + "\n", chat_id=chat_id, model=model)
@@ -17256,13 +17311,17 @@ async def _run_ground() -> None:
1725617311 except Exception: # noqa: BLE001
1725717312 pass
1725817313 _dag_refs = _src_collected()
17314+ # OpenAI grounding: drop off-topic sources before citing. web-tools hardening.
17315+ _dag_refs = _filter_relevant_sources(_dag_refs, main)
1725917316 if _dag_refs and "**Sources:**" not in main:
1726017317 main = main.rstrip() + _sources_markdown(_dag_refs)
1726117318 return JSONResponse(content={
1726217319 "id": chat_id, "object": "chat.completion",
1726317320 "created": int(time.time()), "model": model,
1726417321 "choices": [{"index": 0,
17265- "message": {"role": "assistant", "content": main},
17322+ "message": {"role": "assistant", "content": main,
17323+ # OpenAI url_citation annotations.
17324+ "annotations": _sources_annotations(_dag_refs, main)},
1726617325 "finish_reason": "stop"}],
1726717326 "usage": _usage_estimate(last_user_text, main), # P4 /v1 conformance
1726817327 "mios_sources": _sources_metadata(_dag_refs) if _dag_refs else [],
@@ -26332,6 +26391,10 @@ async def _work() -> None:
2633226391 except Exception: # noqa: BLE001
2633326392 pass
2633426393 _refs = _src_collected() or _refs
26394+ # OpenAI grounding: keep ONLY sources that support the answer -- drop the
26395+ # off-topic bleed (a Fedora answer must not cite 'Shaolin monks') before any
26396+ # citation surface. web-tools hardening 2026-06-21.
26397+ _refs = _filter_relevant_sources(_refs, _ans, last_user_text)
2633526398 if _refs and _ans and _ans.strip() and "**Sources:**" not in _ans:
2633626399 _append = _sources_markdown(_refs)
2633726400 if _append:
@@ -26360,7 +26423,10 @@ async def _stream_native() -> AsyncGenerator[bytes, None]:
2636026423 "id": chat_id, "object": "chat.completion",
2636126424 "created": int(time.time()), "model": model,
2636226425 "choices": [{"index": 0,
26363- "message": {"role": "assistant", "content": _ans},
26426+ "message": {"role": "assistant", "content": _ans,
26427+ # OpenAI url_citation annotations (canonical
26428+ # citation contract). web-tools hardening 2026-06-21.
26429+ "annotations": _sources_annotations(_refs, _ans)},
2636426430 "finish_reason": "stop"}],
2636526431 "usage": _usage_estimate(last_user_text, _ans),
2636626432 "mios_sources": _sources_metadata(_refs) if _refs else [],
@@ -28922,6 +28988,7 @@ def _finished_sec_status() -> list:
2892228988 except Exception: # noqa: BLE001
2892328989 pass
2892428990 _stream_refs = _src_collected()
28991+ _stream_refs = _filter_relevant_sources(_stream_refs, wrapped)
2892528992 if _stream_refs and "**Sources:**" not in wrapped:
2892628993 wrapped = wrapped.rstrip() + _sources_markdown(_stream_refs)
2892728994 yield _sse_chunk("", chat_id=chat_id, model=model,
@@ -29159,10 +29226,17 @@ def _sec_body(_n, _c):
2915929226 except Exception: # noqa: BLE001
2916029227 pass
2916129228 _refs = _src_collected()
29229+ # OpenAI grounding: keep ONLY sources that support the answer
29230+ # (drop the off-topic bleed) before citing. web-tools hardening
29231+ # 2026-06-21.
29232+ _refs = _filter_relevant_sources(_refs, wrapped, last_user_text)
2916229233 if _refs and "**Sources:**" not in wrapped:
2916329234 wrapped = wrapped.rstrip() + _sources_markdown(_refs)
2916429235 if _refs:
2916529236 backend_json["mios_sources"] = _sources_metadata(_refs)
29237+ # OpenAI url_citation annotations -- the canonical citation
29238+ # contract so clients render clickable web cites.
29239+ msg["annotations"] = _sources_annotations(_refs, wrapped)
2916629240 msg["content"] = wrapped
2916729241 choices[0]["message"] = msg
2916829242 backend_json["choices"] = choices
0 commit comments