Skip to content

Commit e6eb39e

Browse files
Updating scraper and start script
1 parent d5165d0 commit e6eb39e

2 files changed

Lines changed: 140 additions & 96 deletions

File tree

package.json

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,8 @@
66
"main": "dist/src/server.js",
77
"scripts": {
88
"build": "tsc",
9-
"start": "NODE_ENV=production node dist/src/server.js",
9+
"start": "node dist/src/server.js",
10+
"start:prod": "NODE_ENV=production node dist/src/server.js",
1011
"start:dev": "NODE_ENV=development node dist/src/server.js",
1112
"dev": "NODE_ENV=development nodemon -x tsx src/server.ts",
1213
"lint": "eslint '{src,test}/**/*.ts' --fix",

prisma/scraper.ts

Lines changed: 138 additions & 95 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,25 @@ import { DEFAULT_IMAGE_URL } from '../src/constants.js';
1111
const prisma = new PrismaClient();
1212

1313
async function getDiningData(): Promise<RawScrapedData> {
14-
const diningData = await fetch(process.env.CORNELL_DINING_API_URL as string);
15-
const data = await diningData.json() as RawScrapedData;
14+
const apiUrl = process.env.CORNELL_DINING_API_URL;
15+
if (!apiUrl) {
16+
throw new Error('CORNELL_DINING_API_URL environment variable is not set');
17+
}
18+
19+
const response = await fetch(apiUrl, {
20+
signal: AbortSignal.timeout(30000), // 30 second timeout
21+
});
22+
23+
if (!response.ok) {
24+
throw new Error(`Failed to fetch dining data: HTTP ${response.status} ${response.statusText}`);
25+
}
26+
27+
const data = await response.json() as RawScrapedData;
28+
29+
if (!data?.data?.eateries) {
30+
throw new Error('Invalid response format from Cornell Dining API');
31+
}
32+
1633
return data;
1734
}
1835

@@ -25,7 +42,7 @@ function loadStaticEateries(): RawStaticEatery[] {
2542
const eateries = Array.isArray(data) ? data : (data.eateries || []);
2643
return eateries;
2744
} catch (error) {
28-
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
45+
if ((error as { code?: string }).code === 'ENOENT') {
2946
console.log('No static eateries file found, skipping...');
3047
return [];
3148
}
@@ -252,44 +269,41 @@ function transformEatery(rawEatery: RawEatery) {
252269
async function transformEateriesConcurrently(
253270
rawEateries: RawEatery[],
254271
concurrency: number = 5
255-
): Promise<Array<{ index: number; result: ReturnType<typeof transformEatery> }>> {
256-
const results: Array<{ index: number; result: ReturnType<typeof transformEatery> }> = [];
257-
const errors: Array<{ index: number; eatery: RawEatery; error: unknown }> = [];
258-
259-
const queue: Array<{ index: number; eatery: RawEatery }> = rawEateries.map((eatery, index) => ({
260-
index,
261-
eatery,
262-
}));
263-
264-
async function worker(workerId: number): Promise<void> {
265-
while (true) {
266-
await new Promise((resolve) => setImmediate(resolve));
267-
268-
const item = queue.shift();
269-
if (!item) break;
270-
271-
const { index, eatery: rawEatery } = item;
272-
273-
try {
274-
const transformed = transformEatery(rawEatery);
275-
results.push({ index, result: transformed });
276-
console.log(` [Worker ${workerId}] ✓ Transformed ${rawEatery.name} (${transformed.events.length} events)`);
277-
} catch (error) {
278-
errors.push({ index, eatery: rawEatery, error });
279-
console.error(` [Worker ${workerId}] ✗ Error transforming ${rawEatery.name}:`, error);
272+
): Promise<ReturnType<typeof transformEatery>[]> {
273+
const results: ReturnType<typeof transformEatery>[] = [];
274+
const errors: Array<{ eatery: RawEatery; error: unknown }> = [];
275+
276+
// Process in batches for better performance
277+
for (let i = 0; i < rawEateries.length; i += concurrency) {
278+
const batch = rawEateries.slice(i, i + concurrency);
279+
const batchResults = await Promise.allSettled(
280+
batch.map(async (rawEatery) => {
281+
try {
282+
const transformed = transformEatery(rawEatery);
283+
console.log(` ✓ Transformed ${rawEatery.name} (${transformed.events.length} events)`);
284+
return transformed;
285+
} catch (error) {
286+
console.error(` ✗ Error transforming ${rawEatery.name}:`, error);
287+
throw error;
288+
}
289+
})
290+
);
291+
292+
for (let j = 0; j < batchResults.length; j++) {
293+
const result = batchResults[j];
294+
if (result.status === 'fulfilled') {
295+
results.push(result.value);
296+
} else {
297+
errors.push({ eatery: batch[j], error: result.reason });
280298
}
281299
}
282300
}
283301

284-
const workers = Array.from({ length: concurrency }, (_, i) => worker(i + 1));
285-
await Promise.all(workers);
286-
287302
if (errors.length > 0) {
288-
const errorMessages = errors.map((e) => `Eatery "${e.eatery.name}" (index ${e.index}): ${e.error}`).join('\n');
303+
const errorMessages = errors.map((e) => `Eatery "${e.eatery.name}": ${e.error}`).join('\n');
289304
throw new Error(`Failed to transform ${errors.length} eatery(ies):\n${errorMessages}`);
290305
}
291306

292-
results.sort((a, b) => a.index - b.index);
293307
return results;
294308
}
295309

@@ -332,33 +346,45 @@ async function processAllEateries(
332346
}>
333347
) {
334348
return await prisma.$transaction(async (tx) => {
349+
// Clear existing data
335350
await tx.event.deleteMany({});
336351
await tx.eatery.deleteMany({});
337352

338-
for (const { eatery, events } of transformedEateries) {
339-
await tx.eatery.create({
340-
data: {
341-
...eatery,
342-
events: {
343-
create: events.map((rawEvent) => ({
344-
type: mapEventType(rawEvent.type),
345-
startTimestamp: rawEvent.startTimestamp,
346-
endTimestamp: rawEvent.endTimestamp,
347-
menu: {
348-
create: rawEvent.menu.map((rawCategory) => ({
349-
name: rawCategory.category,
350-
items: {
351-
create: rawCategory.items.map((rawItem) => ({
352-
name: rawItem.item,
353+
// Process eateries in smaller batches within the transaction
354+
const BATCH_SIZE = 10;
355+
for (let i = 0; i < transformedEateries.length; i += BATCH_SIZE) {
356+
const batch = transformedEateries.slice(i, i + BATCH_SIZE);
357+
358+
await Promise.all(
359+
batch.map(({ eatery, events }) =>
360+
tx.eatery.create({
361+
data: {
362+
...eatery,
363+
events: {
364+
create: events.map((rawEvent) => ({
365+
type: mapEventType(rawEvent.type),
366+
startTimestamp: rawEvent.startTimestamp,
367+
endTimestamp: rawEvent.endTimestamp,
368+
menu: {
369+
create: rawEvent.menu.map((rawCategory) => ({
370+
name: rawCategory.category,
371+
items: {
372+
create: rawCategory.items.map((rawItem) => ({
373+
name: rawItem.item,
374+
})),
375+
},
353376
})),
354377
},
355378
})),
356379
},
357-
})),
358-
},
359-
},
360-
});
380+
},
381+
})
382+
)
383+
);
361384
}
385+
}, {
386+
maxWait: 20000, // Wait up to 20 seconds to start the transaction
387+
timeout: 60000, // Allow the transaction to run for up to 60 seconds
362388
});
363389
}
364390

@@ -383,6 +409,50 @@ async function getAllEateriesData() {
383409
});
384410
}
385411

412+
async function updateServerCache(): Promise<void> {
413+
const serverUrl = process.env.SERVER_URL;
414+
const cacheRefreshHeader = process.env.CACHE_REFRESH_HEADER;
415+
const cacheRefreshSecret = process.env.CACHE_REFRESH_SECRET;
416+
417+
if (!serverUrl || !cacheRefreshHeader || !cacheRefreshSecret) {
418+
console.log('⚠️ Server cache update skipped: Missing SERVER_URL, CACHE_REFRESH_HEADER, or CACHE_REFRESH_SECRET');
419+
return;
420+
}
421+
422+
try {
423+
console.log('Fetching all eatery data for server update...');
424+
const startFetchTime = Date.now();
425+
const allEateryData = await getAllEateriesData();
426+
const fetchDuration = ((Date.now() - startFetchTime) / 1000).toFixed(2);
427+
console.log(`Fetched ${allEateryData.length} eateries in ${fetchDuration}s`);
428+
429+
console.log('Updating server cache with new eatery data...');
430+
const startUpdateTime = Date.now();
431+
const response = await fetch(`${serverUrl}/internal/cache/`, {
432+
method: 'POST',
433+
headers: {
434+
'Content-Type': 'application/json',
435+
[cacheRefreshHeader]: cacheRefreshSecret,
436+
},
437+
body: JSON.stringify({ eateries: allEateryData }),
438+
signal: AbortSignal.timeout(30000), // 30 second timeout
439+
});
440+
441+
if (!response.ok) {
442+
const errorText = await response.text().catch(() => 'Unable to read error response');
443+
throw new Error(
444+
`Server responded with status ${response.status}: ${errorText}`,
445+
);
446+
}
447+
448+
const updateDuration = ((Date.now() - startUpdateTime) / 1000).toFixed(2);
449+
console.log(`✓ Server cache updated successfully in ${updateDuration}s`);
450+
} catch (error) {
451+
console.error('✗ Failed to update server cache with new eatery data:', error);
452+
// Don't throw - this is not critical enough to fail the entire scraper
453+
}
454+
}
455+
386456
export async function main() {
387457
const startTime = Date.now();
388458
console.log('Starting scraper at', new Date(startTime).toString(), '\n');
@@ -440,9 +510,9 @@ export async function main() {
440510
console.log(`Found ${diningData.data.eateries.length} eateries from API (${apiFetchDuration}s)`);
441511

442512
const transformStartTime = Date.now();
443-
console.log(`Transforming API eatery data with ${process.env.WORKERS} concurrent workers...`);
444-
const transformResults = await transformEateriesConcurrently(diningData.data.eateries, parseInt(process.env.WORKERS || '4', 10));
445-
const transformedApiEateries = transformResults.map((r) => r.result);
513+
const workerCount = parseInt(process.env.WORKERS || '4', 10);
514+
console.log(`Transforming API eatery data with ${workerCount} concurrent workers...`);
515+
const transformedApiEateries = await transformEateriesConcurrently(diningData.data.eateries, workerCount);
446516
const transformDuration = ((Date.now() - transformStartTime) / 1000).toFixed(2);
447517
console.log(`✓ Successfully transformed ${transformedApiEateries.length} API eateries (${transformDuration}s)\n`);
448518

@@ -490,46 +560,8 @@ export async function main() {
490560
}
491561

492562
// Send newly populated data to server
493-
console.log('[Scheduler] Scraper run finished\n');
494-
try {
495-
console.log('[Scheduler] Fetching all eatery data for server update...');
496-
const startFetchTime = Date.now();
497-
const allEateryData = await getAllEateriesData();
498-
const fetchDuration = ((Date.now() - startFetchTime) / 1000).toFixed(2);
499-
console.log(
500-
`[Scheduler] Fetched ${allEateryData.length} eateries in ${fetchDuration}s`,
501-
);
502-
503-
console.log('[Scheduler] Updating server cache with new eatery data...');
504-
const startUpdateTime = Date.now();
505-
const response = await fetch(
506-
`${process.env.SERVER_URL}/internal/cache/`,
507-
{
508-
method: 'POST',
509-
headers: {
510-
'Content-Type': 'application/json',
511-
[process.env.CACHE_REFRESH_HEADER!]:
512-
process.env.CACHE_REFRESH_SECRET!,
513-
},
514-
body: JSON.stringify({ eateries: allEateryData }),
515-
},
516-
);
517-
if (!response.ok) {
518-
throw new Error(
519-
`Server responded with status ${response.status} during cache update`,
520-
);
521-
}
522-
const updateDuration = ((Date.now() - startUpdateTime) / 1000).toFixed(2);
523-
console.log(
524-
`[Scheduler] Server cache updated successfully in ${updateDuration}s`,
525-
);
526-
}
527-
catch (error) {
528-
console.error(
529-
'[Scheduler] Failed to update server cache with new eatery data:',
530-
error,
531-
);
532-
}
563+
console.log('\nScraper run finished\n');
564+
await updateServerCache();
533565

534566
const totalDuration = ((Date.now() - startTime) / 1000).toFixed(2);
535567
console.log(`\n✅ Dining data scraped successfully in ${totalDuration}s`);
@@ -584,8 +616,19 @@ function startScraperScheduler() {
584616
}
585617

586618
if (process.env.SCHEDULED_MODE === 'true') {
619+
console.log('[Scheduler] Running initial scraper on startup...');
620+
621+
// Run scraper immediately on startup
622+
runScraperSafely().then(() => {
623+
console.log('[Scheduler] Initial scraper run completed');
624+
}).catch((error) => {
625+
console.error('[Scheduler] Initial scraper run failed:', error);
626+
});
627+
628+
// Start the scheduler for future runs
587629
startScraperScheduler();
588630
console.log('[Scheduler] Scraper scheduler is running. Press Ctrl+C to stop.');
631+
589632
const gracefulShutdown = async () => {
590633
console.log('[Scheduler] Shutting down gracefully...');
591634
await prisma.$disconnect();

0 commit comments

Comments (0)