Skip to content

Commit af777a8

Browse files
VinciGit00 and claude committed
feat: align CLI with scrapegraph-js v2 c5bf757
- Bump scrapegraph-js pin b570a57 → c5bf757
- scrape: support 8 formats (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma-separated -f, add --html-mode, --scrolls, --prompt/--schema for json format
- search: add --location-geo-code, --time-range, --format
- crawl: add --format flag
- README: document new flags and formats

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 27be6f4 commit af777a8

6 files changed

Lines changed: 149 additions & 29 deletions

File tree

README.md

Lines changed: 37 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,13 @@ Search the web and extract structured data from results (replaces `search-scrape
132132
### Usage
133133

134134
```bash
135-
just-scrape search <query> # AI-powered web search
136-
just-scrape search <query> --num-results <n> # Sources to scrape (1-20, default 3)
137-
just-scrape search <query> -p <prompt> # Extraction prompt for results
138-
just-scrape search <query> --schema <json> # Enforce output schema
135+
just-scrape search <query> # AI-powered web search
136+
just-scrape search <query> --num-results <n> # Sources to scrape (1-20, default 3)
137+
just-scrape search <query> -p <prompt> # Extraction prompt for results
138+
just-scrape search <query> --schema <json> # Enforce output schema
139+
just-scrape search <query> --location-geo-code <code> # Geo-target search (e.g. 'us', 'de', 'jp-tk')
140+
just-scrape search <query> --time-range <range> # past_hour | past_24_hours | past_week | past_month | past_year
141+
just-scrape search <query> --format <markdown|html> # Result format (default markdown)
139142
just-scrape search <query> --headers <json>
140143
```
141144

@@ -145,40 +148,53 @@ just-scrape search <query> --headers <json>
145148
# Research a topic across multiple sources
146149
just-scrape search "What are the best Python web frameworks in 2025?" --num-results 10
147150

151+
# Recent news only, scoped to Germany
152+
just-scrape search "EU AI act latest news" --time-range past_week --location-geo-code de
153+
148154
# Structured output with schema
149155
just-scrape search "Top 5 cloud providers pricing" \
150156
--schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}'
151157
```
152158

153159
## Scrape
154160

155-
Scrape content from a URL in various formats: markdown (default), html, screenshot, or branding. [docs](https://docs.scrapegraphai.com/api-reference/scrape)
161+
Scrape content from a URL in one or more formats. The v2 API supports **8 formats**: `markdown`, `html`, `screenshot`, `branding`, `links`, `images`, `summary`, `json`. [docs](https://docs.scrapegraphai.com/api-reference/scrape)
156162

157163
### Usage
158164

159165
```bash
160-
just-scrape scrape <url> # Markdown (default)
161-
just-scrape scrape <url> -f html # Raw HTML
162-
just-scrape scrape <url> -f screenshot # Screenshot
163-
just-scrape scrape <url> -f branding # Extract branding info
164-
just-scrape scrape <url> -m direct+stealth # Anti-bot bypass
165-
just-scrape scrape <url> --country <iso> # Geo-targeting
166+
just-scrape scrape <url> # Markdown (default)
167+
just-scrape scrape <url> -f html # Raw HTML
168+
just-scrape scrape <url> -f screenshot # Page screenshot
169+
just-scrape scrape <url> -f branding # Branding (logos, colors, fonts)
170+
just-scrape scrape <url> -f links # Extracted links
171+
just-scrape scrape <url> -f images # Extracted images
172+
just-scrape scrape <url> -f summary # AI-generated page summary
173+
just-scrape scrape <url> -f json -p <prompt> # Structured JSON via prompt
174+
just-scrape scrape <url> -f markdown,links,images # Multi-format (comma-separated)
175+
just-scrape scrape <url> --html-mode reader # normal (default), reader, or prune
176+
just-scrape scrape <url> --scrolls <n> # Infinite scroll (0-100)
177+
just-scrape scrape <url> -m direct+stealth # Anti-bot bypass
178+
just-scrape scrape <url> --country <iso> # Geo-targeting
166179
```
167180

168181
### Examples
169182

170183
```bash
171-
# Get markdown of a page
184+
# Markdown of a page
172185
just-scrape scrape https://example.com
173186

174-
# Get raw HTML
175-
just-scrape scrape https://example.com -f html
187+
# Raw HTML with reader-mode extraction
188+
just-scrape scrape https://blog.example.com -f html --html-mode reader
189+
190+
# Multi-format: markdown + links + images in a single call
191+
just-scrape scrape https://example.com -f markdown,links,images
192+
193+
# Structured JSON output with a prompt
194+
just-scrape scrape https://store.example.com -f json -p "Extract product name and price"
176195

177196
# Scrape with anti-bot bypass and geo-targeting
178197
just-scrape scrape https://store.example.com -m direct+stealth --country DE
179-
180-
# Extract branding info (logos, colors, fonts)
181-
just-scrape scrape https://example.com -f branding
182198
```
183199

184200
## Markdownify
@@ -218,6 +234,7 @@ just-scrape crawl <url> --max-pages <n> # Max pages (default 50)
218234
just-scrape crawl <url> --max-depth <n> # Crawl depth (default 2)
219235
just-scrape crawl <url> --max-links-per-page <n> # Links per page (default 10)
220236
just-scrape crawl <url> --allow-external # Allow external domains
237+
just-scrape crawl <url> -f html # Page format (default markdown)
221238
just-scrape crawl <url> -m direct+stealth # Anti-bot bypass
222239
```
223240

@@ -283,8 +300,9 @@ Commands have been renamed to match the v2 API:
283300
| `smart-scraper` | `extract` | Renamed |
284301
| `search-scraper` | `search` | Renamed |
285302
| `markdownify` | `markdownify` | Now wraps `scrape --format markdown` |
286-
| `scrape` | `scrape` | Gains `--format` flag (markdown, html, screenshot, branding) |
287-
| `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external` |
303+
| `scrape` | `scrape` | Gains `--format` (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma, `--html-mode`, `--scrolls`, `--prompt`, `--schema` |
304+
| `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external`, `--format` |
305+
| `search` | `search` | New options: `--location-geo-code`, `--time-range`, `--format` |
288306
| `--stealth` flag | `--mode direct+stealth` | Fetch mode enum replaces boolean (`auto`, `fast`, `js`, `direct+stealth`, `js+stealth`) |
289307
| `agentic-scraper` || Removed from API |
290308
| `generate-schema` || Removed from API |

bun.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"chalk": "^5.4.1",
2929
"citty": "^0.1.6",
3030
"dotenv": "^17.2.4",
31-
"scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#b570a57"
31+
"scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#c5bf757"
3232
},
3333
"devDependencies": {
3434
"@biomejs/biome": "^1.9.4",

src/commands/crawl.ts

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,17 @@ export default defineCommand({
1919
"max-depth": { type: "string", description: "Crawl depth (default 2)" },
2020
"max-links-per-page": { type: "string", description: "Max links per page (default 10)" },
2121
"allow-external": { type: "boolean", description: "Allow crawling external domains" },
22-
mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" },
22+
format: {
23+
type: "string",
24+
alias: "f",
25+
description:
26+
"Page format: markdown (default), html, screenshot, branding, links, images, summary",
27+
},
28+
mode: {
29+
type: "string",
30+
alias: "m",
31+
description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth",
32+
},
2333
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
2434
},
2535
run: async ({ args }) => {
@@ -33,6 +43,7 @@ export default defineCommand({
3343
if (args["max-links-per-page"])
3444
crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]);
3545
if (args["allow-external"]) crawlOptions.allowExternal = true;
46+
if (args.format) crawlOptions.format = args.format;
3647
if (args.mode) crawlOptions.fetchConfig = { mode: args.mode };
3748

3849
out.start("Crawling");
@@ -54,7 +65,11 @@ export default defineCommand({
5465
const statusData = status.data as { status: string; [key: string]: unknown };
5566
out.poll(statusData.status);
5667

57-
if (statusData.status === "completed" || statusData.status === "failed" || statusData.status === "cancelled") {
68+
if (
69+
statusData.status === "completed" ||
70+
statusData.status === "failed" ||
71+
statusData.status === "cancelled"
72+
) {
5873
out.stop(Math.round(performance.now() - t0));
5974
out.result(status.data);
6075
return;

src/commands/scrape.ts

Lines changed: 76 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,23 @@ import { defineCommand } from "citty";
22
import { createClient } from "../lib/client.js";
33
import * as log from "../lib/log.js";
44

5+
const FORMATS = [
6+
"markdown",
7+
"html",
8+
"screenshot",
9+
"branding",
10+
"links",
11+
"images",
12+
"summary",
13+
"json",
14+
] as const;
15+
type Format = (typeof FORMATS)[number];
16+
517
export default defineCommand({
618
meta: {
719
name: "scrape",
8-
description: "Scrape content from a URL (markdown, html, screenshot, or branding)",
20+
description:
21+
"Scrape content from a URL (markdown, html, screenshot, branding, links, images, summary, json)",
922
},
1023
args: {
1124
url: {
@@ -16,9 +29,27 @@ export default defineCommand({
1629
format: {
1730
type: "string",
1831
alias: "f",
19-
description: "Output format: markdown (default), html, screenshot, branding",
32+
description: `Output format: ${FORMATS.join(", ")} (default: markdown). Comma-separate for multi-format output.`,
33+
},
34+
prompt: {
35+
type: "string",
36+
alias: "p",
37+
description: "Prompt for json format (required when --format includes json)",
2038
},
21-
mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" },
39+
schema: {
40+
type: "string",
41+
description: "Schema for json format (JSON string)",
42+
},
43+
mode: {
44+
type: "string",
45+
alias: "m",
46+
description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth",
47+
},
48+
"html-mode": {
49+
type: "string",
50+
description: "HTML/markdown extraction mode: normal (default), reader, prune",
51+
},
52+
scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" },
2253
country: { type: "string", description: "ISO country code for geo-targeting" },
2354
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
2455
},
@@ -29,10 +60,50 @@ export default defineCommand({
2960

3061
const fetchConfig: Record<string, unknown> = {};
3162
if (args.mode) fetchConfig.mode = args.mode;
63+
if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls);
3264
if (args.country) fetchConfig.country = args.country;
3365

34-
const scrapeOptions: Record<string, unknown> = {};
35-
if (args.format) scrapeOptions.format = args.format;
66+
const requestedFormats = (args.format ?? "markdown")
67+
.split(",")
68+
.map((f) => f.trim())
69+
.filter(Boolean) as Format[];
70+
const htmlMode = (args["html-mode"] as "normal" | "reader" | "prune" | undefined) ?? "normal";
71+
72+
const formats = requestedFormats.map((f) => {
73+
switch (f) {
74+
case "markdown":
75+
return { type: "markdown" as const, mode: htmlMode };
76+
case "html":
77+
return { type: "html" as const, mode: htmlMode };
78+
case "screenshot":
79+
return { type: "screenshot" as const };
80+
case "branding":
81+
return { type: "branding" as const };
82+
case "links":
83+
return { type: "links" as const };
84+
case "images":
85+
return { type: "images" as const };
86+
case "summary":
87+
return { type: "summary" as const };
88+
case "json": {
89+
if (!args.prompt) {
90+
out.error("--prompt is required when --format includes json");
91+
return { type: "json" as const };
92+
}
93+
return {
94+
type: "json" as const,
95+
prompt: args.prompt,
96+
schema: args.schema ? JSON.parse(args.schema) : undefined,
97+
mode: htmlMode,
98+
};
99+
}
100+
default:
101+
out.error(`Unknown format: ${f}. Valid: ${FORMATS.join(", ")}`);
102+
return { type: "markdown" as const, mode: htmlMode };
103+
}
104+
});
105+
106+
const scrapeOptions: Record<string, unknown> = { formats };
36107
if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig;
37108

38109
out.start("Scraping");

src/commands/search.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@ export default defineCommand({
2323
description: "Number of websites to scrape (1-20, default 3)",
2424
},
2525
schema: { type: "string", description: "Output JSON schema (as JSON string)" },
26+
"location-geo-code": {
27+
type: "string",
28+
description: "Geo-location code for search (e.g. 'us', 'de', 'jp-tk')",
29+
},
30+
"time-range": {
31+
type: "string",
32+
description:
33+
"Filter results by recency: past_hour, past_24_hours, past_week, past_month, past_year",
34+
},
35+
format: {
36+
type: "string",
37+
description: "Result format: markdown (default) or html",
38+
},
2639
headers: { type: "string", description: "Custom headers as JSON object string" },
2740
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
2841
},
@@ -35,6 +48,9 @@ export default defineCommand({
3548
if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]);
3649
if (args.schema) searchOptions.schema = JSON.parse(args.schema);
3750
if (args.prompt) searchOptions.prompt = args.prompt;
51+
if (args["location-geo-code"]) searchOptions.locationGeoCode = args["location-geo-code"];
52+
if (args["time-range"]) searchOptions.timeRange = args["time-range"];
53+
if (args.format) searchOptions.format = args.format;
3854
if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) };
3955

4056
out.start("Searching");

0 commit comments

Comments
 (0)