-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcrawl.ts
More file actions
59 lines (54 loc) · 2.22 KB
/
crawl.ts
File metadata and controls
59 lines (54 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import { defineCommand } from "citty";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";
import * as scrapegraphai from "../lib/scrapegraphai.js";
/**
 * Converts the raw string value of a numeric CLI flag into a number.
 *
 * @param flag - Flag name without the leading dashes, used in the error message.
 * @param raw - The string the user supplied for the flag.
 * @returns The parsed finite number.
 * @throws TypeError when `raw` is blank or does not convert to a finite number.
 */
function parseNumberFlag(flag: string, raw: string): number {
  const value = Number(raw);
  // Number("   ") is 0 and Number("abc") is NaN; neither is what the user meant.
  if (raw.trim() === "" || !Number.isFinite(value)) {
    throw new TypeError(`Invalid value for --${flag}: expected a number, got "${raw}"`);
  }
  return value;
}

/**
 * Parses the raw string value of a JSON-valued CLI flag.
 *
 * The return type is deliberately left as whatever `JSON.parse` yields so the
 * result stays assignable to the corresponding `CrawlParams` fields exactly as
 * it was before this helper existed.
 *
 * @param flag - Flag name without the leading dashes, used in the error message.
 * @param raw - The JSON text the user supplied for the flag.
 * @throws SyntaxError when `raw` is not valid JSON; the message names the flag.
 */
function parseJsonFlag(flag: string, raw: string) {
  try {
    return JSON.parse(raw);
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(`Invalid JSON for --${flag}: ${detail}`);
  }
}

/**
 * `crawl` command: translates CLI flags into `scrapegraphai.CrawlParams`,
 * calls `scrapegraphai.crawl`, and reports either the returned data or the
 * returned error through the logger created by `log.create`.
 *
 * Only flags the user actually supplied are copied into the request, so any
 * omitted option is left unset in the params object.
 */
export default defineCommand({
  meta: {
    name: "crawl",
    description: "Crawl and extract data from multiple pages",
  },
  args: {
    url: {
      type: "positional",
      description: "Starting URL to crawl",
      required: true,
    },
    prompt: {
      type: "string",
      alias: "p",
      description: "Extraction prompt (required when extraction mode is on)",
    },
    "no-extraction": {
      type: "boolean",
      description: "Return markdown only (2 credits/page instead of 10)",
    },
    "max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
    depth: { type: "string", description: "Crawl depth (default 1)" },
    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
    rules: { type: "string", description: "Crawl rules as JSON object string" },
    "no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
    "render-js": { type: "boolean", description: "Enable heavy JS rendering (+1 credit/page)" },
    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
  },
  run: async ({ args }) => {
    const out = log.create(!!args.json);
    out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
    const key = await resolveApiKey(!!args.json);

    const params: scrapegraphai.CrawlParams = { url: args.url };
    if (args.prompt) params.prompt = args.prompt;
    if (args["no-extraction"]) params.extraction_mode = false;
    // Numeric and JSON flags arrive as strings; validate them here so a typo
    // fails with a message naming the flag instead of sending NaN or throwing
    // an anonymous parse error.
    if (args["max-pages"]) params.max_pages = parseNumberFlag("max-pages", args["max-pages"]);
    if (args.depth) params.depth = parseNumberFlag("depth", args.depth);
    if (args.schema) params.schema = parseJsonFlag("schema", args.schema);
    if (args.rules) params.rules = parseJsonFlag("rules", args.rules);
    if (args["no-sitemap"]) params.sitemap = false;
    if (args["render-js"]) params.render_heavy_js = true;
    if (args.stealth) params.stealth = true;

    out.start("Crawling");
    // out.poll is handed to the client as a callback — presumably for progress
    // updates while the crawl runs; confirm against lib/scrapegraphai.
    const result = await scrapegraphai.crawl(key, params, out.poll);
    out.stop(result.elapsedMs);

    if (result.data) out.result(result.data);
    else out.error(result.error);
  },
});