summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Dax Raad <[email protected]> 2025-06-19 10:43:54 -0400
committer: Dax Raad <[email protected]> 2025-06-19 10:43:54 -0400
commit: 7ca8334a8b39f19fe04e563189bc07c63253c256 (patch)
tree: 81f0dff4da95df617a12b23cb9894d90cb58f62d
parent: f1a2b2eba4e711fb83966143efd604ab9397eab8 (diff)
download: opencode-7ca8334a8b39f19fe04e563189bc07c63253c256.tar.gz
opencode-7ca8334a8b39f19fe04e563189bc07c63253c256.zip
fix webfetch tool when returning html as text
-rw-r--r-- packages/opencode/src/tool/webfetch.ts 45
1 files changed, 40 insertions, 5 deletions
diff --git a/packages/opencode/src/tool/webfetch.ts b/packages/opencode/src/tool/webfetch.ts
index cb501b44b..5b7b9f9d9 100644
--- a/packages/opencode/src/tool/webfetch.ts
+++ b/packages/opencode/src/tool/webfetch.ts
@@ -76,7 +76,7 @@ export const WebFetchTool = Tool.define({
switch (params.format) {
case "text":
if (contentType.includes("text/html")) {
- const text = extractTextFromHTML(content)
+ const text = await extractTextFromHTML(content)
return {
output: text,
metadata: {
@@ -127,10 +127,45 @@ export const WebFetchTool = Tool.define({
},
})
-function extractTextFromHTML(html: string): string {
- const doc = new DOMParser().parseFromString(html, "text/html")
- const text = doc.body.textContent || doc.body.innerText || ""
- return text.replace(/\s+/g, " ").trim()
+/**
+ * Extracts the readable text of an HTML document.
+ *
+ * Streams `html` through `HTMLRewriter` and concatenates every text chunk
+ * that is not nested inside a script, style, noscript, iframe or object
+ * element.
+ *
+ * @param html - Raw HTML markup.
+ * @returns The collected text with leading and trailing whitespace trimmed.
+ */
+async function extractTextFromHTML(html: string): Promise<string> {
+  let text = ""
+  // Number of currently open elements whose content must be ignored. A
+  // counter (rather than a boolean) keeps nested elements inside a skipped
+  // subtree from re-enabling collection, and lets collection resume right
+  // after the closing tag instead of at the next start tag.
+  let skipDepth = 0
+
+  const rewriter = new HTMLRewriter()
+    // `embed` is deliberately not listed: it is a void element, so it has no
+    // content to skip and no end tag that could undo the skip.
+    .on("script, style, noscript, iframe, object", {
+      element(element) {
+        skipDepth++
+        // NOTE(review): relies on the runtime's HTMLRewriter supporting
+        // `onEndTag` for these elements - confirm against the Bun version in use.
+        element.onEndTag(() => {
+          skipDepth--
+        })
+      },
+    })
+    .on("*", {
+      text(input) {
+        if (skipDepth === 0) {
+          text += input.text
+        }
+      },
+    })
+    .transform(new Response(html))
+
+  // The handlers only run while the transformed body is consumed; the
+  // rewritten markup itself is discarded.
+  await rewriter.text()
+  return text.trim()
}
function convertHTMLToMarkdown(html: string): string {