|
14 | 14 | * CREATE_REPORT - Whether to create an issue report (default: false) |
15 | 15 | * REPORT_REPOSITORY - Repository to create report issues in |
16 | 16 | * CACHE_MAX_AGE_DAYS - How long to cache URL check results (default: 7) |
| 17 | + * DOMAIN_CONCURRENCY - Number of domains to process concurrently (default: 10) |
17 | 18 | */ |
18 | 19 |
|
19 | 20 | import { program } from 'commander' |
@@ -42,6 +43,7 @@ const CACHE_MAX_AGE_MS = CACHE_MAX_AGE_DAYS * 24 * 60 * 60 * 1000 |
42 | 43 | // Request configuration |
43 | 44 | const REQUEST_TIMEOUT_MS = 30000 // 30 seconds |
44 | 45 | const REQUEST_DELAY_MS = 100 // 100ms between requests to avoid rate limiting |
| 46 | +const DEFAULT_DOMAIN_CONCURRENCY = 10 // Process this many domains in parallel |
45 | 47 |
|
46 | 48 | // Create a set for fast lookups of excluded links |
47 | 49 | const excludedLinksSet = new Set(excludedLinks.map(({ is }) => is).filter(Boolean)) |
@@ -241,6 +243,11 @@ async function main() { |
241 | 243 | .name('check-links-external') |
242 | 244 | .description('External link checker with caching') |
243 | 245 | .option('--max <number>', 'Maximum number of URLs to check', parseInt) |
| 246 | + .option( |
| 247 | + '--domain-concurrency <number>', |
| 248 | + 'Number of domains to process concurrently', |
| 249 | + String(DEFAULT_DOMAIN_CONCURRENCY), |
| 250 | + ) |
244 | 251 | .option('--verbose', 'Verbose output') |
245 | 252 | .option('--dry-run', "Extract links but don't check them") |
246 | 253 | .parse() |
@@ -308,55 +315,107 @@ async function main() { |
308 | 315 |
|
309 | 316 | const urls = Array.from(allLinks.keys()) |
310 | 317 | const maxUrls = options.max ? Math.min(options.max, urls.length) : urls.length |
| 318 | + const domainConcurrency = Math.max( |
| 319 | + 1, |
| 320 | + parseInt(process.env.DOMAIN_CONCURRENCY || options.domainConcurrency, 10), |
| 321 | + ) |
| 322 | + let malformedCount = 0 |
311 | 323 |
|
312 | | - console.log(`Checking ${maxUrls} URLs (may take a while)...`) |
313 | | - |
| 324 | + // Group URLs by hostname so we can check multiple domains in parallel |
| 325 | + // while keeping requests to any single domain sequential. |
| 326 | + const urlsByDomain = new Map<string, string[]>() |
314 | 327 | for (let i = 0; i < maxUrls; i++) { |
315 | 328 | const url = urls[i] |
316 | | - const occurrences = allLinks.get(url)! |
317 | | - |
318 | | - const result = await checkUrl(url, db.data) |
319 | | - checkedCount++ |
320 | | - |
321 | | - if (result.cached) { |
322 | | - cachedCount++ |
323 | | - } |
324 | | - |
325 | | - if (!result.ok) { |
| 329 | + try { |
| 330 | + const hostname = new URL(url).hostname |
| 331 | + if (!urlsByDomain.has(hostname)) urlsByDomain.set(hostname, []) |
| 332 | + urlsByDomain.get(hostname)!.push(url) |
| 333 | + } catch { |
| 334 | + // Record malformed URLs as broken links |
| 335 | + malformedCount++ |
| 336 | + checkedCount++ |
| 337 | + const occurrences = allLinks.get(url)! |
326 | 338 | for (const occ of occurrences) { |
327 | 339 | brokenLinks.push({ |
328 | 340 | href: occ.href, |
329 | 341 | file: occ.file, |
330 | 342 | lines: [occ.line], |
331 | | - statusCode: result.statusCode, |
332 | | - errorMessage: result.error, |
| 343 | + errorMessage: 'Malformed URL', |
333 | 344 | }) |
334 | 345 | } |
| 346 | + } |
| 347 | + } |
| 348 | + const queuedUrlCount = Array.from(urlsByDomain.values()).reduce( |
| 349 | + (count, domainUrls) => count + domainUrls.length, |
| 350 | + 0, |
| 351 | + ) |
| 352 | + const plannedTotal = queuedUrlCount + malformedCount |
335 | 353 |
|
336 | | - if (options.verbose) { |
337 | | - console.log(` ❌ ${url} - ${result.error || `HTTP ${result.statusCode}`}`) |
| 354 | + console.log( |
| 355 | + `Checking ${plannedTotal} URLs across ${urlsByDomain.size} domains (up to ${domainConcurrency} domains at once)...`, |
| 356 | + ) |
| 357 | + |
| 358 | + // Check all URLs for one domain sequentially, respecting the per-request delay. |
| 359 | + async function checkDomainUrls(domainUrls: string[]): Promise<void> { |
| 360 | + for (const url of domainUrls) { |
| 361 | + const occurrences = allLinks.get(url)! |
| 362 | + const result = await checkUrl(url, db.data) |
| 363 | + checkedCount++ |
| 364 | + |
| 365 | + if (result.cached) cachedCount++ |
| 366 | + |
| 367 | + if (!result.ok) { |
| 368 | + for (const occ of occurrences) { |
| 369 | + brokenLinks.push({ |
| 370 | + href: url, |
| 371 | + file: occ.file, |
| 372 | + lines: [occ.line], |
| 373 | + statusCode: result.statusCode, |
| 374 | + errorMessage: result.error, |
| 375 | + }) |
| 376 | + } |
| 377 | + if (options.verbose) { |
| 378 | + console.log(` ❌ ${url} - ${result.error || `HTTP ${result.statusCode}`}`) |
| 379 | + } |
| 380 | + } else if (options.verbose && !result.cached) { |
| 381 | + console.log(` ✅ ${url}`) |
| 382 | + } |
| 383 | + |
| 384 | + // Progress update every 100 URLs |
| 385 | + if (checkedCount % 100 === 0) { |
| 386 | + const elapsed = ((Date.now() - startTime) / 1000).toFixed(0) |
| 387 | + const rate = (checkedCount / (Date.now() - startTime)) * 1000 * 60 |
| 388 | + const remaining = plannedTotal - checkedCount |
| 389 | + const etaMin = (remaining / rate).toFixed(0) |
| 390 | + console.log( |
| 391 | + ` Checked ${checkedCount}/${plannedTotal} URLs (${cachedCount} cached) — ${elapsed}s elapsed, ~${etaMin}m remaining`, |
| 392 | + ) |
338 | 393 | } |
339 | | - } else if (options.verbose && !result.cached) { |
340 | | - console.log(` ✅ ${url}`) |
341 | | - } |
342 | 394 |
|
343 | | - // Progress update every 100 URLs |
344 | | - if (checkedCount % 100 === 0) { |
345 | | - const elapsed = ((Date.now() - startTime) / 1000).toFixed(0) |
346 | | - const rate = (checkedCount / (Date.now() - startTime)) * 1000 * 60 |
347 | | - const remaining = maxUrls - checkedCount |
348 | | - const etaMin = (remaining / rate).toFixed(0) |
349 | | - console.log( |
350 | | - ` Checked ${checkedCount}/${maxUrls} URLs (${cachedCount} cached) — ${elapsed}s elapsed, ~${etaMin}m remaining`, |
351 | | - ) |
| 395 | + // Small delay between requests to the same domain to avoid rate limiting |
| 396 | + if (!result.cached) { |
| 397 | + await sleep(REQUEST_DELAY_MS) |
| 398 | + } |
352 | 399 | } |
| 400 | + } |
353 | 401 |
|
354 | | - // Small delay between non-cached requests to avoid rate limiting |
355 | | - if (!result.cached) { |
356 | | - await sleep(REQUEST_DELAY_MS) |
| 402 | + // Distribute domains round-robin across DOMAIN_CONCURRENCY workers. Each worker |
| 403 | + // processes its assigned domains sequentially, so we get parallelism across |
| 404 | + // domains without hammering any single domain. |
| 405 | + const domainQueues = Array.from(urlsByDomain.values()) |
| 406 | + const workers: string[][][] = Array.from({ length: domainConcurrency }, () => []) |
| 407 | + for (let i = 0; i < domainQueues.length; i++) { |
| 408 | + workers[i % domainConcurrency].push(domainQueues[i]) |
| 409 | + } |
| 410 | + |
| 411 | + async function runWorker(workerDomains: string[][]): Promise<void> { |
| 412 | + for (const domainUrls of workerDomains) { |
| 413 | + await checkDomainUrls(domainUrls) |
357 | 414 | } |
358 | 415 | } |
359 | 416 |
|
| 417 | + await Promise.all(workers.map(runWorker)) |
| 418 | + |
360 | 419 | // Save cache |
361 | 420 | await db.write() |
362 | 421 |
|
|
0 commit comments