Skip to content

Commit c7af17b

Browse files
xyspg and chenxin-yan authored
feat(scraper): add courses scraping (#85)
Co-authored-by: Chenxin Yan <yanchenxin2004@gmail.com>
1 parent 59b54a0 commit c7af17b

10 files changed

Lines changed: 432 additions & 78 deletions

File tree

apps/scraper/src/index.ts

Lines changed: 47 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,9 @@ import { discoverCourses, scrapeCourse } from "./modules/courses";
1313
import { discoverPrograms, scrapeProgram } from "./modules/programs";
1414

1515
const app = new Hono<{ Bindings: CloudflareBindings }>();
16+
// Pause, in milliseconds, awaited before each "course" job is scraped; a value of 0 or less skips the pause.
const COURSE_SCRAPE_DELAY_MS = 250;
17+
18+
/**
 * Returns a promise that settles once a timer of `ms` milliseconds fires.
 *
 * @param ms - Timer duration in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves with no value when the timer callback runs.
 */
const sleep = (ms: number): Promise<void> =>
  new Promise<void>((done) => {
    setTimeout(done, ms);
  });
1619

1720
const validateApiKey = async (
1821
c: Context<{ Bindings: CloudflareBindings }>,
@@ -210,19 +213,31 @@ export default {
210213
}
211214
case "discover-courses": {
212215
const courseUrls = await discoverCourses(job.url);
213-
const newJobs = await db
214-
.insert(jobs)
215-
.values(
216-
courseUrls.map((url) => ({
217-
url,
218-
jobType: "course" as const,
219-
})),
220-
)
221-
.returning();
216+
// NOTE: Cloudflare Queues has a limit of 100 messages per sendBatch()
217+
console.log(`Discovered ${courseUrls.length} course URLs`);
218+
219+
const BATCH_SIZE = 10;
220+
for (let i = 0; i < courseUrls.length; i += BATCH_SIZE) {
221+
const batch = courseUrls.slice(i, i + BATCH_SIZE);
222+
223+
const newJobs = await db
224+
.insert(jobs)
225+
.values(
226+
batch.map((url) => ({
227+
url,
228+
jobType: "course" as const,
229+
})),
230+
)
231+
.returning();
232+
233+
await env.SCRAPING_QUEUE.sendBatch(
234+
newJobs.map((j) => ({ body: { jobId: j.id } })),
235+
);
222236

223-
await env.SCRAPING_QUEUE.sendBatch(
224-
newJobs.map((j) => ({ body: { jobId: j.id } })),
225-
);
237+
console.log(
238+
`Queued batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil(courseUrls.length / BATCH_SIZE)} (${newJobs.length} jobs)`,
239+
);
240+
}
226241
break;
227242
}
228243
case "program": {
@@ -242,18 +257,28 @@ export default {
242257
break;
243258
}
244259
case "course": {
245-
const res = await scrapeCourse(job.url, db, env);
260+
// A single URL may contain multiple courses
261+
if (COURSE_SCRAPE_DELAY_MS > 0) {
262+
await sleep(COURSE_SCRAPE_DELAY_MS);
263+
}
264+
const courses = await scrapeCourse(job.url, db, env);
246265

247-
const courseId = await convex.upsertCourseWithPrerequisites({
248-
...res.course,
249-
prerequisites: res.prerequisites,
250-
});
266+
console.log(
267+
`Scraped ${courses.length} courses from ${job.url}`,
268+
);
251269

252-
if (!courseId) {
253-
throw new JobError(
254-
"Failed to upsert course: no ID returned",
255-
"validation",
256-
);
270+
for (const courseData of courses) {
271+
const courseId = await convex.upsertCourseWithPrerequisites({
272+
...courseData.course,
273+
prerequisites: courseData.prerequisites,
274+
});
275+
276+
if (!courseId) {
277+
throw new JobError(
278+
`Failed to upsert course ${courseData.course.code}: no ID returned`,
279+
"validation",
280+
);
281+
}
257282
}
258283
break;
259284
}

apps/scraper/src/modules/courses/index.test.ts

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,16 +33,21 @@ describe("Courses Scraper", () => {
3333
const mockDb = createMockDb();
3434
const mockEnv = createMockEnv();
3535

36-
const result = await scrapeCourse(
36+
const courses = await scrapeCourse(
3737
"https://bulletins.nyu.edu/courses/acct_gb/",
3838
mockDb,
3939
mockEnv,
4040
);
4141

42-
expect(result).toHaveProperty("course");
43-
expect(result).toHaveProperty("prerequisites");
44-
expect(typeof result.course).toBe("object");
45-
expect(Array.isArray(result.prerequisites)).toBe(true);
42+
expect(Array.isArray(courses)).toBe(true);
43+
expect(courses.length).toBeGreaterThan(0);
44+
45+
for (const result of courses) {
46+
expect(result).toHaveProperty("course");
47+
expect(result).toHaveProperty("prerequisites");
48+
expect(typeof result.course).toBe("object");
49+
expect(Array.isArray(result.prerequisites)).toBe(true);
50+
}
4651
});
4752

4853
test("should handle invalid course URLs", async () => {

0 commit comments

Comments
 (0)