Skip to content

Commit 3123f35

Browse files
steves and Copilot authored
Parallelize external link checks per domain (#61185)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: steves <54561+steves@users.noreply.github.com>
1 parent 21cae3a commit 3123f35

1 file changed

Lines changed: 90 additions & 31 deletions

File tree

src/links/scripts/check-links-external.ts

Lines changed: 90 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
* CREATE_REPORT - Whether to create an issue report (default: false)
1515
* REPORT_REPOSITORY - Repository to create report issues in
1616
* CACHE_MAX_AGE_DAYS - How long to cache URL check results (default: 7)
17+
* DOMAIN_CONCURRENCY - Number of domains to process concurrently (default: 10)
1718
*/
1819

1920
import { program } from 'commander'
@@ -42,6 +43,7 @@ const CACHE_MAX_AGE_MS = CACHE_MAX_AGE_DAYS * 24 * 60 * 60 * 1000
4243
// Request configuration
4344
const REQUEST_TIMEOUT_MS = 30000 // 30 seconds
4445
const REQUEST_DELAY_MS = 100 // 100ms between requests to avoid rate limiting
46+
const DEFAULT_DOMAIN_CONCURRENCY = 10 // Process this many domains in parallel
4547

4648
// Create a set for fast lookups of excluded links
4749
const excludedLinksSet = new Set(excludedLinks.map(({ is }) => is).filter(Boolean))
@@ -241,6 +243,11 @@ async function main() {
241243
.name('check-links-external')
242244
.description('External link checker with caching')
243245
.option('--max <number>', 'Maximum number of URLs to check', parseInt)
246+
.option(
247+
'--domain-concurrency <number>',
248+
'Number of domains to process concurrently',
249+
String(DEFAULT_DOMAIN_CONCURRENCY),
250+
)
244251
.option('--verbose', 'Verbose output')
245252
.option('--dry-run', "Extract links but don't check them")
246253
.parse()
@@ -308,55 +315,107 @@ async function main() {
308315

309316
const urls = Array.from(allLinks.keys())
310317
const maxUrls = options.max ? Math.min(options.max, urls.length) : urls.length
318+
const domainConcurrency = Math.max(
319+
1,
320+
parseInt(process.env.DOMAIN_CONCURRENCY || options.domainConcurrency, 10),
321+
)
322+
let malformedCount = 0
311323

312-
console.log(`Checking ${maxUrls} URLs (may take a while)...`)
313-
324+
// Group URLs by hostname so we can check multiple domains in parallel
325+
// while keeping requests to any single domain sequential.
326+
const urlsByDomain = new Map<string, string[]>()
314327
for (let i = 0; i < maxUrls; i++) {
315328
const url = urls[i]
316-
const occurrences = allLinks.get(url)!
317-
318-
const result = await checkUrl(url, db.data)
319-
checkedCount++
320-
321-
if (result.cached) {
322-
cachedCount++
323-
}
324-
325-
if (!result.ok) {
329+
try {
330+
const hostname = new URL(url).hostname
331+
if (!urlsByDomain.has(hostname)) urlsByDomain.set(hostname, [])
332+
urlsByDomain.get(hostname)!.push(url)
333+
} catch {
334+
// Record malformed URLs as broken links
335+
malformedCount++
336+
checkedCount++
337+
const occurrences = allLinks.get(url)!
326338
for (const occ of occurrences) {
327339
brokenLinks.push({
328340
href: occ.href,
329341
file: occ.file,
330342
lines: [occ.line],
331-
statusCode: result.statusCode,
332-
errorMessage: result.error,
343+
errorMessage: 'Malformed URL',
333344
})
334345
}
346+
}
347+
}
348+
const queuedUrlCount = Array.from(urlsByDomain.values()).reduce(
349+
(count, domainUrls) => count + domainUrls.length,
350+
0,
351+
)
352+
const plannedTotal = queuedUrlCount + malformedCount
335353

336-
if (options.verbose) {
337-
console.log(` ❌ ${url} - ${result.error || `HTTP ${result.statusCode}`}`)
354+
console.log(
355+
`Checking ${plannedTotal} URLs across ${urlsByDomain.size} domains (up to ${domainConcurrency} domains at once)...`,
356+
)
357+
358+
// Check all URLs for one domain sequentially, respecting the per-request delay.
359+
async function checkDomainUrls(domainUrls: string[]): Promise<void> {
360+
for (const url of domainUrls) {
361+
const occurrences = allLinks.get(url)!
362+
const result = await checkUrl(url, db.data)
363+
checkedCount++
364+
365+
if (result.cached) cachedCount++
366+
367+
if (!result.ok) {
368+
for (const occ of occurrences) {
369+
brokenLinks.push({
370+
href: url,
371+
file: occ.file,
372+
lines: [occ.line],
373+
statusCode: result.statusCode,
374+
errorMessage: result.error,
375+
})
376+
}
377+
if (options.verbose) {
378+
console.log(` ❌ ${url} - ${result.error || `HTTP ${result.statusCode}`}`)
379+
}
380+
} else if (options.verbose && !result.cached) {
381+
console.log(` ✅ ${url}`)
382+
}
383+
384+
// Progress update every 100 URLs
385+
if (checkedCount % 100 === 0) {
386+
const elapsed = ((Date.now() - startTime) / 1000).toFixed(0)
387+
const rate = (checkedCount / (Date.now() - startTime)) * 1000 * 60
388+
const remaining = plannedTotal - checkedCount
389+
const etaMin = (remaining / rate).toFixed(0)
390+
console.log(
391+
` Checked ${checkedCount}/${plannedTotal} URLs (${cachedCount} cached) — ${elapsed}s elapsed, ~${etaMin}m remaining`,
392+
)
338393
}
339-
} else if (options.verbose && !result.cached) {
340-
console.log(` ✅ ${url}`)
341-
}
342394

343-
// Progress update every 100 URLs
344-
if (checkedCount % 100 === 0) {
345-
const elapsed = ((Date.now() - startTime) / 1000).toFixed(0)
346-
const rate = (checkedCount / (Date.now() - startTime)) * 1000 * 60
347-
const remaining = maxUrls - checkedCount
348-
const etaMin = (remaining / rate).toFixed(0)
349-
console.log(
350-
` Checked ${checkedCount}/${maxUrls} URLs (${cachedCount} cached) — ${elapsed}s elapsed, ~${etaMin}m remaining`,
351-
)
395+
// Small delay between requests to the same domain to avoid rate limiting
396+
if (!result.cached) {
397+
await sleep(REQUEST_DELAY_MS)
398+
}
352399
}
400+
}
353401

354-
// Small delay between non-cached requests to avoid rate limiting
355-
if (!result.cached) {
356-
await sleep(REQUEST_DELAY_MS)
402+
// Distribute domains round-robin across DOMAIN_CONCURRENCY workers. Each worker
403+
// processes its assigned domains sequentially, so we get parallelism across
404+
// domains without hammering any single domain.
405+
const domainQueues = Array.from(urlsByDomain.values())
406+
const workers: string[][][] = Array.from({ length: domainConcurrency }, () => [])
407+
for (let i = 0; i < domainQueues.length; i++) {
408+
workers[i % domainConcurrency].push(domainQueues[i])
409+
}
410+
411+
async function runWorker(workerDomains: string[][]): Promise<void> {
412+
for (const domainUrls of workerDomains) {
413+
await checkDomainUrls(domainUrls)
357414
}
358415
}
359416

417+
await Promise.all(workers.map(runWorker))
418+
360419
// Save cache
361420
await db.write()
362421

0 commit comments

Comments
 (0)