-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathcrawl.js
More file actions
93 lines (86 loc) · 2.72 KB
/
Copy pathcrawl.js
File metadata and controls
93 lines (86 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import axios from 'axios';
import handleError from './utils/handleError.js';
import { ZodType } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';
/**
 * Start a crawl job using the ScrapeGraphAI API.
 *
 * Sends a POST request to `https://api.scrapegraphai.com/v1/crawl` and resolves
 * with the response body. A Zod schema is converted with `zodToJsonSchema`;
 * any other non-null object is sent as the schema unchanged.
 *
 * @param {string} apiKey - Your ScrapeGraph AI API key (sent in the `SGAI-APIKEY` header)
 * @param {string} url - The starting URL for the crawl
 * @param {string} prompt - The prompt to guide the crawl and extraction
 * @param {Object|ZodType} schema - JSON schema or Zod schema defining the structure of the extracted data
 * @param {Object|null} [options] - Optional crawl parameters; omitted or `null` means "use every default"
 * @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
 * @param {number} [options.depth=2] - Maximum depth of the crawl (documented range 1-10; not validated here)
 * @param {number} [options.maxPages=2] - Maximum number of pages to crawl (documented range 1-100; not validated here)
 * @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
 * @param {number} [options.batchSize=1] - Batch size for processing pages (documented range 1-10; not validated here)
 * @returns {Promise<Object>} The crawl job response (`response.data`)
 * @throws {Error} If `schema` is neither a Zod schema nor a non-null object
 * @throws {Error} If the HTTP request fails; the failure is handed to `handleError`
 *   (from ./utils/handleError.js), which is expected to throw
 */
export async function crawl(
  apiKey,
  url,
  prompt,
  schema,
  options = {}
) {
  const endpoint = 'https://api.scrapegraphai.com/v1/crawl';
  const headers = {
    'accept': 'application/json',
    'SGAI-APIKEY': apiKey,
    'Content-Type': 'application/json',
  };

  let schemaPayload;
  if (schema instanceof ZodType) {
    schemaPayload = zodToJsonSchema(schema);
  } else if (typeof schema === 'object' && schema !== null) {
    schemaPayload = schema;
  } else {
    throw new Error('The schema must be a Zod schema or a plain object');
  }

  // The parameter default only covers `undefined`; an explicit `null` would
  // make the destructuring below throw a TypeError, so fall back to `{}`.
  const {
    cacheWebsite = true,
    depth = 2,
    maxPages = 2,
    sameDomainOnly = true,
    batchSize = 1,
  } = options || {};

  // The API expects snake_case keys, so the camelCase options are renamed here.
  const payload = {
    url,
    prompt,
    schema: schemaPayload,
    cache_website: cacheWebsite,
    depth,
    max_pages: maxPages,
    same_domain_only: sameDomainOnly,
    batch_size: batchSize,
  };

  try {
    const response = await axios.post(endpoint, payload, { headers });
    return response.data;
  } catch (error) {
    handleError(error);
  }
}
/**
 * Get the result of a crawl job by ID.
 *
 * Sends a GET request to `https://api.scrapegraphai.com/v1/crawl/<crawlId>` and
 * resolves with the response body.
 *
 * @param {string} apiKey - Your ScrapeGraph AI API key (sent in the `SGAI-APIKEY` header)
 * @param {string} crawlId - The crawl job ID; it is percent-encoded before being placed in the URL path
 * @returns {Promise<Object>} The crawl result (`response.data`)
 * @throws {Error} If the HTTP request fails; the failure is handed to `handleError`
 *   (from ./utils/handleError.js), which is expected to throw
 */
export async function getCrawlRequest(apiKey, crawlId) {
  // Encode the ID so reserved characters such as `/`, `?` or `#` stay inside
  // this single path segment instead of changing the requested path or query.
  const endpoint = `https://api.scrapegraphai.com/v1/crawl/${encodeURIComponent(crawlId)}`;
  const headers = {
    'accept': 'application/json',
    'SGAI-APIKEY': apiKey,
  };

  try {
    const response = await axios.get(endpoint, { headers });
    return response.data;
  } catch (error) {
    handleError(error);
  }
}