Skip to content

Commit ad6a8a1

Browse files
authored
fix: use htmlparser2 instead of HTMLRewriter for node compat (anomalyco#26309)
1 parent 46daede commit ad6a8a1

4 files changed

Lines changed: 42 additions & 29 deletions

File tree

bun.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/opencode/package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,7 @@
141141
"glob": "13.0.5",
142142
"google-auth-library": "10.5.0",
143143
"gray-matter": "4.0.3",
144+
"htmlparser2": "8.0.2",
144145
"ignore": "7.0.5",
145146
"immer": "11.1.4",
146147
"jsonc-parser": "3.3.1",

packages/opencode/src/tool/webfetch.ts

Lines changed: 21 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import { Effect, Schema } from "effect"
22
import { HttpClient, HttpClientRequest } from "effect/unstable/http"
3+
import { Parser } from "htmlparser2"
34
import * as Tool from "./tool"
45
import TurndownService from "turndown"
56
import DESCRIPTION from "./webfetch.txt"
@@ -139,8 +140,7 @@ export const WebFetchTool = Tool.define(
139140

140141
case "text":
141142
if (contentType.includes("text/html")) {
142-
const text = yield* Effect.promise(() => extractTextFromHTML(content))
143-
return { output: text, title, metadata: {} }
143+
return { output: extractTextFromHTML(content), title, metadata: {} }
144144
}
145145
return { output: content, title, metadata: {} }
146146

@@ -155,35 +155,27 @@ export const WebFetchTool = Tool.define(
155155
}),
156156
)
157157

158-
async function extractTextFromHTML(html: string) {
158+
function extractTextFromHTML(html: string) {
159159
let text = ""
160-
let skipContent = false
161-
162-
const rewriter = new HTMLRewriter()
163-
.on("script, style, noscript, iframe, object, embed", {
164-
element() {
165-
skipContent = true
166-
},
167-
text() {
168-
// Skip text content inside these elements
169-
},
170-
})
171-
.on("*", {
172-
element(element) {
173-
// Reset skip flag when entering other elements
174-
if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) {
175-
skipContent = false
176-
}
177-
},
178-
text(input) {
179-
if (!skipContent) {
180-
text += input.text
181-
}
182-
},
183-
})
184-
.transform(new Response(html))
160+
let skipDepth = 0
161+
162+
const parser = new Parser({
163+
onopentag(name) {
164+
if (skipDepth > 0 || ["script", "style", "noscript", "iframe", "object", "embed"].includes(name)) {
165+
skipDepth++
166+
}
167+
},
168+
ontext(input) {
169+
if (skipDepth === 0) text += input
170+
},
171+
onclosetag() {
172+
if (skipDepth > 0) skipDepth--
173+
},
174+
})
175+
176+
parser.write(html)
177+
parser.end()
185178

186-
await rewriter.text()
187179
return text.trim()
188180
}
189181

packages/opencode/test/tool/webfetch.test.ts

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,4 +91,23 @@ describe("tool.webfetch", () => {
9191
}),
9292
),
9393
)
94+
95+
it.instance("extracts text from html without scripts or styles", () =>
96+
withFetch(
97+
() =>
98+
new Response(
99+
"<html><head><style>.hidden{}</style><script>alert('x')</script></head><body>Hello <b>world</b></body></html>",
100+
{
101+
status: 200,
102+
headers: { "content-type": "text/html; charset=utf-8" },
103+
},
104+
),
105+
(url) =>
106+
Effect.gen(function* () {
107+
const result = yield* exec({ url: new URL("/page.html", url).toString(), format: "text" })
108+
expect(result.output).toBe("Hello world")
109+
expect(result.attachments).toBeUndefined()
110+
}),
111+
),
112+
)
94113
})

0 commit comments

Comments
 (0)