diff options
| author | Dax Raad <[email protected]> | 2025-06-19 10:43:54 -0400 |
|---|---|---|
| committer | Dax Raad <[email protected]> | 2025-06-19 10:43:54 -0400 |
| commit | 7ca8334a8b39f19fe04e563189bc07c63253c256 (patch) | |
| tree | 81f0dff4da95df617a12b23cb9894d90cb58f62d | |
| parent | f1a2b2eba4e711fb83966143efd604ab9397eab8 (diff) | |
| download | opencode-7ca8334a8b39f19fe04e563189bc07c63253c256.tar.gz opencode-7ca8334a8b39f19fe04e563189bc07c63253c256.zip | |
fix webfetch tool when returning html as text
| -rw-r--r-- | packages/opencode/src/tool/webfetch.ts | 45 |
1 file changed, 40 insertions, 5 deletions
```diff
diff --git a/packages/opencode/src/tool/webfetch.ts b/packages/opencode/src/tool/webfetch.ts
index cb501b44b..5b7b9f9d9 100644
--- a/packages/opencode/src/tool/webfetch.ts
+++ b/packages/opencode/src/tool/webfetch.ts
@@ -76,7 +76,7 @@ export const WebFetchTool = Tool.define({
     switch (params.format) {
       case "text":
         if (contentType.includes("text/html")) {
-          const text = extractTextFromHTML(content)
+          const text = await extractTextFromHTML(content)
           return {
             output: text,
             metadata: {
@@ -127,10 +127,45 @@ export const WebFetchTool = Tool.define({
   },
 })
 
-function extractTextFromHTML(html: string): string {
-  const doc = new DOMParser().parseFromString(html, "text/html")
-  const text = doc.body.textContent || doc.body.innerText || ""
-  return text.replace(/\s+/g, " ").trim()
+async function extractTextFromHTML(html: string) {
+  let text = ""
+  let skipContent = false
+
+  const rewriter = new HTMLRewriter()
+    .on("script, style, noscript, iframe, object, embed", {
+      element() {
+        skipContent = true
+      },
+      text() {
+        // Skip text content inside these elements
+      },
+    })
+    .on("*", {
+      element(element) {
+        // Reset skip flag when entering other elements
+        if (
+          ![
+            "script",
+            "style",
+            "noscript",
+            "iframe",
+            "object",
+            "embed",
+          ].includes(element.tagName)
+        ) {
+          skipContent = false
+        }
+      },
+      text(input) {
+        if (!skipContent) {
+          text += input.text
+        }
+      },
+    })
+    .transform(new Response(html))
+
+  await rewriter.text()
+  return text.trim()
 }
 
 function convertHTMLToMarkdown(html: string): string {
```
