-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontent.js
More file actions
executable file
·112 lines (86 loc) · 2.64 KB
/
content.js
File metadata and controls
executable file
·112 lines (86 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env node
// Content extractor — fetches a URL and converts it to readable markdown
// Uses jsdom + @mozilla/readability + turndown
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
const FETCH_TIMEOUT = 15_000;
const MAX_HTML_SIZE = 5 * 1024 * 1024; // 5 MB
function usage() {
console.error(`Usage: content.js <url>
Fetches a URL and extracts the main content as readable markdown.
Environment:
No API key needed — fetches pages directly.`);
}
function createTurndown() {
const td = new TurndownService({
headingStyle: "atx",
codeBlockStyle: "fenced",
bulletListMarker: "-",
});
td.use(gfm);
// Remove script/style/nav elements
td.remove(["script", "style", "nav", "footer", "header"]);
return td;
}
async function extractContent(url) {
const res = await fetch(url, {
headers: {
"User-Agent":
"Mozilla/5.0 (compatible; OpenClaw/1.0; +https://github.com/openclaw)",
Accept: "text/html,application/xhtml+xml",
},
signal: AbortSignal.timeout(FETCH_TIMEOUT),
redirect: "follow",
});
if (!res.ok) {
throw new Error(`HTTP ${res.status} ${res.statusText}`);
}
const contentType = res.headers.get("content-type") || "";
if (!contentType.includes("text/html") && !contentType.includes("xhtml")) {
throw new Error(`Not an HTML page (content-type: ${contentType})`);
}
const html = await res.text();
if (html.length > MAX_HTML_SIZE) {
throw new Error(
`Page too large (${(html.length / 1024 / 1024).toFixed(1)} MB, max ${MAX_HTML_SIZE / 1024 / 1024} MB)`
);
}
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article || !article.content) {
throw new Error("Could not extract readable content from page");
}
const turndown = createTurndown();
const markdown = turndown.turndown(article.content);
const lines = [`# ${article.title || "Untitled"}`, `> Source: ${url}`, ""];
if (article.byline) {
lines.push(`*By ${article.byline}*`, "");
}
lines.push(markdown);
return lines.join("\n");
}
async function main() {
const url = process.argv[2];
if (!url) {
usage();
process.exit(1);
}
// Basic URL validation
try {
new URL(url);
} catch {
console.error(`Error: Invalid URL: ${url}`);
process.exit(1);
}
try {
const markdown = await extractContent(url);
console.log(markdown);
} catch (err) {
console.error(`Error extracting content from ${url}: ${err.message}`);
process.exit(1);
}
}
main();