@@ -13,6 +13,9 @@ import { discoverCourses, scrapeCourse } from "./modules/courses";
1313import { discoverPrograms , scrapeProgram } from "./modules/programs" ;
1414
// Hono application instance; its `Bindings` type parameter is set to `CloudflareBindings`.
1515const app = new Hono < { Bindings : CloudflareBindings } > ( ) ;
/**
 * Pause, in milliseconds, awaited before each course scrape.
 * The consumer only sleeps when this value is greater than zero, so setting
 * it to 0 (or a negative number) disables the pause.
 */
const COURSE_SCRAPE_DELAY_MS = 250;

/**
 * Returns a promise that resolves (with no value) once a timer of `ms`
 * milliseconds has fired.
 *
 * @param ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns A promise that settles with `undefined` after the timer fires.
 */
const sleep = (ms: number): Promise<void> =>
  new Promise<void>((resolve) => {
    // `setTimeout` invokes `resolve` without arguments, so the promise
    // fulfils with `undefined`; the explicit `void` typing reflects that.
    setTimeout(resolve, ms);
  });
1619
1720const validateApiKey = async (
1821 c : Context < { Bindings : CloudflareBindings } > ,
@@ -210,19 +213,31 @@ export default {
210213 }
211214 case "discover-courses" : {
212215 const courseUrls = await discoverCourses ( job . url ) ;
213- const newJobs = await db
214- . insert ( jobs )
215- . values (
216- courseUrls . map ( ( url ) => ( {
217- url,
218- jobType : "course" as const ,
219- } ) ) ,
220- )
221- . returning ( ) ;
216+ // NOTE: Cloudflare Queues has a limit of 100 messages per sendBatch()
217+ console . log ( `Discovered ${ courseUrls . length } course URLs` ) ;
218+
219+ const BATCH_SIZE = 10 ;
220+ for ( let i = 0 ; i < courseUrls . length ; i += BATCH_SIZE ) {
221+ const batch = courseUrls . slice ( i , i + BATCH_SIZE ) ;
222+
223+ const newJobs = await db
224+ . insert ( jobs )
225+ . values (
226+ batch . map ( ( url ) => ( {
227+ url,
228+ jobType : "course" as const ,
229+ } ) ) ,
230+ )
231+ . returning ( ) ;
232+
233+ await env . SCRAPING_QUEUE . sendBatch (
234+ newJobs . map ( ( j ) => ( { body : { jobId : j . id } } ) ) ,
235+ ) ;
222236
223- await env . SCRAPING_QUEUE . sendBatch (
224- newJobs . map ( ( j ) => ( { body : { jobId : j . id } } ) ) ,
225- ) ;
237+ console . log (
238+ `Queued batch ${ Math . floor ( i / BATCH_SIZE ) + 1 } /${ Math . ceil ( courseUrls . length / BATCH_SIZE ) } (${ newJobs . length } jobs)` ,
239+ ) ;
240+ }
226241 break ;
227242 }
228243 case "program" : {
@@ -242,18 +257,28 @@ export default {
242257 break ;
243258 }
244259 case "course" : {
245- const res = await scrapeCourse ( job . url , db , env ) ;
260+ // A single URL may contain multiple courses
261+ if ( COURSE_SCRAPE_DELAY_MS > 0 ) {
262+ await sleep ( COURSE_SCRAPE_DELAY_MS ) ;
263+ }
264+ const courses = await scrapeCourse ( job . url , db , env ) ;
246265
247- const courseId = await convex . upsertCourseWithPrerequisites ( {
248- ...res . course ,
249- prerequisites : res . prerequisites ,
250- } ) ;
266+ console . log (
267+ `Scraped ${ courses . length } courses from ${ job . url } ` ,
268+ ) ;
251269
252- if ( ! courseId ) {
253- throw new JobError (
254- "Failed to upsert course: no ID returned" ,
255- "validation" ,
256- ) ;
270+ for ( const courseData of courses ) {
271+ const courseId = await convex . upsertCourseWithPrerequisites ( {
272+ ...courseData . course ,
273+ prerequisites : courseData . prerequisites ,
274+ } ) ;
275+
276+ if ( ! courseId ) {
277+ throw new JobError (
278+ `Failed to upsert course ${ courseData . course . code } : no ID returned` ,
279+ "validation" ,
280+ ) ;
281+ }
257282 }
258283 break ;
259284 }
0 commit comments