-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathtestBrowserExtraction.ts
More file actions
106 lines (86 loc) · 3.09 KB
/
testBrowserExtraction.ts
File metadata and controls
106 lines (86 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
import { chromium } from "playwright";
import { extract, ContentFormat } from "../index";
import { z } from "zod";
import * as path from "path";
import { config } from "dotenv";
// Point dotenv at the `.env` file in the directory the process was started from.
const envFilePath = path.resolve(process.cwd(), ".env");
config({ path: envFilePath });
// Schema for one product entry. Only name, price and productUrl are required;
// every other field is optional.
const productEntrySchema = z.object({
  name: z.string().describe("Product name or title"),
  brand: z.string().optional().describe("Brand name"),
  price: z.number().describe("Current price"),
  originalPrice: z.number().optional().describe("Original price if on sale"),
  rating: z.number().optional().describe("Product rating out of 5"),
  reviewCount: z.number().optional().describe("Number of reviews"),
  productUrl: z.string().url().describe("Link to product detail page"),
  imageUrl: z.string().url().optional().describe("Product image URL"),
});

// Top-level schema passed to `extract`: an object holding the list of products.
const productCatalogSchema = z.object({
  products: z
    .array(productEntrySchema)
    .describe("List of bread and bakery products"),
});
/**
 * Opens `url` in a non-headless Chromium instance and returns the page's HTML.
 *
 * The browser is closed in a `finally` block so that a failure while creating
 * the page, navigating, or reading the content does not leave a browser
 * process running.
 *
 * @param url - Address of the page to load.
 * @returns The HTML returned by `page.content()`.
 * @throws Whatever `chromium.launch`, `newPage`, `goto`, `content` or `close`
 *   throw; only a failure of the network-idle wait is tolerated.
 */
async function loadPageHtml(url: string): Promise<string> {
  const browser = await chromium.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    try {
      await page.waitForLoadState("networkidle", { timeout: 10000 });
    } catch {
      // Best effort: if the wait fails (e.g. the 10 s timeout elapses),
      // carry on with whatever has been loaded so far.
      console.log("Network idle timeout, continuing...");
    }
    const html = await page.content();
    console.log(`Loaded ${html.length} characters of HTML`);
    return html;
  } finally {
    // Always release the browser, including on the error path.
    await browser.close();
    console.log("Browser closed");
  }
}

/**
 * Loads a fixed product listing page in a browser, passes its HTML to
 * `extract` together with `productCatalogSchema`, and prints the extracted
 * data and token usage to the console.
 *
 * Errors raised while loading the page or extracting are logged with
 * `console.error` and not rethrown, so the returned promise resolves either
 * way.
 */
async function testProductCatalogExtraction(): Promise<void> {
  console.log("Testing Product Catalog Extraction...\n");
  const testUrl =
    "https://www.walmart.ca/en/browse/grocery/bread-bakery/10019_6000194327359";
  try {
    console.log(`Loading product catalog page: ${testUrl}`);
    const html = await loadPageHtml(testUrl);

    console.log("\nExtracting product data using LLM...");
    const result = await extract({
      llm: new ChatGoogleGenerativeAI({
        apiKey: process.env.GOOGLE_API_KEY,
        model: "gemini-2.5-flash",
        temperature: 0,
      }),
      content: html,
      format: ContentFormat.HTML,
      sourceUrl: testUrl,
      schema: productCatalogSchema,
      htmlExtractionOptions: {
        extractMainHtml: true,
        includeImages: true,
        cleanUrls: true,
      },
    });

    console.log("Extraction successful!");
    console.log("EXTRACTED PRODUCT CATALOG DATA:");
    console.log("=".repeat(80));
    console.log(JSON.stringify(result.data, null, 2));
    console.log("=".repeat(80));
    console.log("\nToken Usage:");
    console.log(`Input tokens: ${result.usage.inputTokens}`);
    console.log(`Output tokens: ${result.usage.outputTokens}`);
  } catch (error) {
    // Deliberately log-and-continue: callers rely on this never rejecting.
    console.error("Error during product catalog extraction:", error);
  }
}
/**
 * Script entry point: verifies that GOOGLE_API_KEY is set, then runs the
 * product catalog extraction. Exits the process with status 1 when the key
 * is missing.
 */
async function main() {
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    // Without a key the LLM call cannot be made, so stop before doing any work.
    console.error("Please set GOOGLE_API_KEY environment variable");
    process.exit(1);
  }

  console.log("Starting product catalog extraction\n");
  await testProductCatalogExtraction();
  console.log("\nExtraction completed!");
}
// Run main() only when this file is executed directly, not when it is imported.
if (require.main === module) {
  main().catch((err: unknown) => {
    console.error(err);
  });
}
export { testProductCatalogExtraction };