Skip to content

Commit 54c39d9

Browse files
author
SqlRush
committed
Skip WebFetch media fallback text
1 parent ff52ad0 commit 54c39d9

4 files changed

Lines changed: 30 additions & 9 deletions

File tree

docs/cc-100-roadmap.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -957,7 +957,7 @@ M7 补充:prompt history `LogEntry` 读取现在接受 `sessionID`/`session`/`
957957

958958
本轮补充:`WebFetch` HTML-to-text rendering 现在会先解析 `<base href>` 再跳过 `<head>` 子树,`<title>` 等 head-only metadata 不再污染 rendered body 或 prompt-focused excerpt,同时不破坏相对链接解析。
959959

960-
本轮补充:`WebFetch` HTML-to-text rendering 现在会跳过 `<iframe>` fallback 子树,iframe 内的备用文本不会被当成当前页面可见正文或 prompt-focused excerpt。
960+
本轮补充:`WebFetch` HTML-to-text rendering 现在会跳过 `<iframe>`、`<audio>`、`<video>` fallback 子树,嵌入内容的备用文本不会被当成当前页面可见正文或 prompt-focused excerpt。
961961

962962
本轮补充:`WebSearch` JSON parser 现在会递归解包 `web`、`response`、`search`、`hits`、`documents`、`records`、`entries` 等常见搜索后端 wrapper,并继续保留 URL 去重和 allowed/blocked domain filter。
963963

docs/claude-code-go-rewrite-plan.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ test/parity/ # golden tests against TS/official behavior
203203
- 本轮补充:WebFetch HTML-to-text rendering 现在会为无可见文本但带 `aria-label`/`title` 的链接保留可访问名称和 resolved href,icon-only 链接可进入 rendered body 与 prompt-focused excerpt。
204204
- 本轮补充:WebFetch HTML-to-text rendering 现在按浏览器可见性处理 closed `<details>`、`<dialog>`:closed details 只渲染第一个 summary 子树,隐藏正文不会进入 excerpt;未带 `open` 的 dialog 会作为不可见子树跳过,open details/dialog 仍正常渲染。
205205
- 本轮补充:WebFetch HTML-to-text rendering 现在会先解析 `<base href>` 再跳过 `<head>` 子树,`<title>` 等 head-only metadata 不再污染 rendered body 或 prompt-focused excerpt,同时不破坏相对链接解析。
206-
- 本轮补充:WebFetch HTML-to-text rendering 现在会跳过 `<iframe>` fallback 子树,iframe 内的备用文本不会被当成当前页面可见正文或 prompt-focused excerpt。
206+
- 本轮补充:WebFetch HTML-to-text rendering 现在会跳过 `<iframe>`、`<audio>`、`<video>` fallback 子树,嵌入内容的备用文本不会被当成当前页面可见正文或 prompt-focused excerpt。
207207
- 本轮补充:WebSearch HTML 结果解析现在会按搜索页首个有效 `<base href>` 解析相对结果 anchor,覆盖镜像/自定义搜索页中浏览器可见结果 URL 与请求路径不一致的情况。
208208
- 本轮补充:WebSearch HTML 结果解析现在会读取 `application/ld+json` JSON-LD 结果,递归抽取 `@graph`、`ItemList.itemListElement.item`,支持 JSON-LD `@id` URL alias,并与后续 anchor 结果按 URL 去重。
209209
- 本轮补充:WebSearch HTML snippet 提取现在识别 Bing 风格 `b_caption`/`b_snippet` 以及常见搜索摘要 class,标题 anchor 后的可见摘要会进入文本输出和 structured result。

internal/tools/web/web_fetch.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,7 @@ func renderWebFetchBody(contentType string, body string, baseURL string) (string
442442
if !isHTMLWebFetchContent(contentType, body) {
443443
return body, false
444444
}
445-
stripped := removeHTMLWebFetchBlocks(body, "script", "style", "noscript", "template", "svg", "canvas", "iframe")
445+
stripped := removeHTMLWebFetchBlocks(body, "script", "style", "noscript", "template", "svg", "canvas", "iframe", "audio", "video")
446446
resolvedBaseURL := webFetchHTMLBaseURL(stripped, baseURL)
447447
stripped = removeHTMLWebFetchBlocks(stripped, "head")
448448
rendered := stripHTMLWebFetchTags(stripped, resolvedBaseURL)

internal/tools/web/web_fetch_test.go

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ func TestWebFetchHTMLRenderingHonorsDetailsAndDialogVisibility(t *testing.T) {
399399
}
400400
}
401401

402-
func TestWebFetchHTMLRenderingSkipsIframeFallbackContent(t *testing.T) {
402+
func TestWebFetchHTMLRenderingSkipsEmbeddedFallbackContent(t *testing.T) {
403403
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
404404
w.Header().Set("Content-Type", "text/html; charset=utf-8")
405405
_, _ = w.Write([]byte(`<!doctype html>
@@ -409,7 +409,13 @@ func TestWebFetchHTMLRenderingSkipsIframeFallbackContent(t *testing.T) {
409409
<iframe src="/embedded">
410410
<p>Hidden iframe pricing leak should not render.</p>
411411
</iframe>
412-
<p>Visible iframe pricing guidance appears in the page body.</p>
412+
<audio controls>
413+
Hidden audio pricing leak should not render.
414+
</audio>
415+
<video controls>
416+
Hidden video pricing leak should not render.
417+
</video>
418+
<p>Visible embedded pricing guidance appears in the page body.</p>
413419
</main>
414420
</body>
415421
</html>`))
@@ -425,16 +431,31 @@ func TestWebFetchHTMLRenderingSkipsIframeFallbackContent(t *testing.T) {
425431
t.Fatal(err)
426432
}
427433
rendered, ok := result.StructuredContent["rendered_body"].(string)
428-
if !ok || !strings.Contains(rendered, "Visible iframe pricing guidance") {
434+
if !ok || !strings.Contains(rendered, "Visible embedded pricing guidance") {
429435
t.Fatalf("rendered body = %#v", result.StructuredContent["rendered_body"])
430436
}
431-
if strings.Contains(rendered, "Hidden iframe pricing leak") {
432-
t.Fatalf("rendered body leaked iframe fallback text: %#v", rendered)
437+
for _, leaked := range []string{
438+
"Hidden iframe pricing leak",
439+
"Hidden audio pricing leak",
440+
"Hidden video pricing leak",
441+
} {
442+
if strings.Contains(rendered, leaked) {
443+
t.Fatalf("rendered body leaked embedded fallback text %q: %#v", leaked, rendered)
444+
}
433445
}
434446
excerpt, ok := result.StructuredContent["prompt_excerpt"].(string)
435-
if !ok || !strings.Contains(excerpt, "Visible iframe pricing guidance") || strings.Contains(excerpt, "Hidden iframe pricing leak") {
447+
if !ok || !strings.Contains(excerpt, "Visible embedded pricing guidance") {
436448
t.Fatalf("prompt excerpt = %#v", result.StructuredContent["prompt_excerpt"])
437449
}
450+
for _, leaked := range []string{
451+
"Hidden iframe pricing leak",
452+
"Hidden audio pricing leak",
453+
"Hidden video pricing leak",
454+
} {
455+
if strings.Contains(excerpt, leaked) {
456+
t.Fatalf("prompt excerpt used embedded fallback text %q: %#v", leaked, excerpt)
457+
}
458+
}
438459
}
439460

440461
func TestWebFetchHTMLRenderingPreservesLinksAndImageText(t *testing.T) {

0 commit comments

Comments
 (0)