Skip to content

Commit ce00e42

Browse files
steves and Copilot authored
Improve internal link checker performance (#61171)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 17411cb commit ce00e42

2 files changed

Lines changed: 342 additions & 136 deletions

File tree

src/links/lib/extract-links.ts

Lines changed: 69 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -57,13 +57,34 @@ export interface LinkExtractionResult {
5757
}
5858

5959
/**
60-
* Get line and column number for a match in content
60+
* Build an array of character offsets at which each line starts.
61+
* offsets[0] is always 0. Called once per extractLinksFromMarkdown invocation
62+
* so that getLineAndColumn can use binary search instead of repeated splits.
6163
*/
62-
function getLineAndColumn(content: string, matchIndex: number): { line: number; column: number } {
63-
const lines = content.substring(0, matchIndex).split('\n')
64-
const line = lines.length
65-
const column = lines[lines.length - 1].length + 1
66-
return { line, column }
64+
function buildLineOffsets(content: string): number[] {
65+
const offsets = [0]
66+
for (let i = 0; i < content.length; i++) {
67+
if (content[i] === '\n') offsets.push(i + 1)
68+
}
69+
return offsets
70+
}
71+
72+
/**
73+
* Get line and column number for a match using a precomputed line-offset index.
74+
* Binary search gives O(log L) per call instead of O(matchIndex).
75+
*/
76+
function getLineAndColumn(
77+
lineOffsets: number[],
78+
matchIndex: number,
79+
): { line: number; column: number } {
80+
let lo = 0
81+
let hi = lineOffsets.length - 1
82+
while (lo < hi) {
83+
const mid = (lo + hi + 1) >> 1
84+
if (lineOffsets[mid] <= matchIndex) lo = mid
85+
else hi = mid - 1
86+
}
87+
return { line: lo + 1, column: matchIndex - lineOffsets[lo] + 1 }
6788
}
6889

6990
/**
@@ -109,10 +130,13 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
109130
},
110131
)
111132

133+
// Precompute line-start offsets once so every getLineAndColumn call is O(log L).
134+
const lineOffsets = buildLineOffsets(strippedContent)
135+
112136
// Extract AUTOTITLE links first (they're a special case of internal links)
113137
let match
114138
while ((match = AUTOTITLE_LINK_PATTERN.exec(strippedContent)) !== null) {
115-
const { line, column } = getLineAndColumn(strippedContent, match.index)
139+
const { line, column } = getLineAndColumn(lineOffsets, match.index)
116140
const href = match[1].split('#')[0] // Remove anchor if present
117141
if (href.startsWith('/')) {
118142
internalLinks.push({
@@ -136,7 +160,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
136160
continue
137161
}
138162

139-
const { line, column } = getLineAndColumn(strippedContent, match.index)
163+
const { line, column } = getLineAndColumn(lineOffsets, match.index)
140164
// Extract href from ](/path) format
141165
const href = fullMatch.substring(2, fullMatch.length - 1).split('#')[0]
142166
const text = extractLinkText(strippedContent, match.index)
@@ -155,7 +179,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
155179

156180
// Extract external links
157181
while ((match = EXTERNAL_LINK_PATTERN.exec(strippedContent)) !== null) {
158-
const { line, column } = getLineAndColumn(strippedContent, match.index)
182+
const { line, column } = getLineAndColumn(lineOffsets, match.index)
159183
const href = match[1]
160184
const text = extractLinkText(strippedContent, match.index)
161185

@@ -172,7 +196,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
172196

173197
// Extract anchor links
174198
while ((match = ANCHOR_LINK_PATTERN.exec(strippedContent)) !== null) {
175-
const { line, column } = getLineAndColumn(strippedContent, match.index)
199+
const { line, column } = getLineAndColumn(lineOffsets, match.index)
176200
const href = match[0].substring(2, match[0].length - 1)
177201

178202
anchorLinks.push({
@@ -188,7 +212,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
188212

189213
// Extract image links
190214
while ((match = IMAGE_LINK_PATTERN.exec(strippedContent)) !== null) {
191-
const { line, column } = getLineAndColumn(strippedContent, match.index)
215+
const { line, column } = getLineAndColumn(lineOffsets, match.index)
192216
const href = match[1]
193217

194218
// Only include internal images (starting with /)
@@ -208,7 +232,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
208232
// Extract reference-style link definitions ([id]: /path)
209233
// These are distinct from inline links but point to the same targets that need validating.
210234
while ((match = LINK_DEFINITION_PATTERN.exec(strippedContent)) !== null) {
211-
const { line, column } = getLineAndColumn(strippedContent, match.index)
235+
const { line, column } = getLineAndColumn(lineOffsets, match.index)
212236
const href = match[1].split('#')[0]
213237
internalLinks.push({
214238
href,
@@ -223,7 +247,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
223247

224248
// Extract links whose href starts with a Liquid tag
225249
while ((match = LIQUID_HREF_PATTERN.exec(strippedContent)) !== null) {
226-
const { line, column } = getLineAndColumn(strippedContent, match.index)
250+
const { line, column } = getLineAndColumn(lineOffsets, match.index)
227251
liquidPrefixedLinks.push({
228252
href: match[1],
229253
line,
@@ -274,6 +298,18 @@ export function createLiquidContext(
274298
} as Context
275299
}
276300

301+
// Cached reference to renderLiquid — avoids repeated dynamic-import overhead on every call.
302+
// A dynamic import is still used (not a top-level import) to prevent circular dependency issues.
303+
type RenderLiquidModule = (template: string, context: unknown) => Promise<string>
304+
let _renderLiquid: RenderLiquidModule | null = null
305+
async function getCachedRenderLiquid(): Promise<RenderLiquidModule> {
306+
if (!_renderLiquid) {
307+
const mod = await import('@/content-render/liquid/index')
308+
_renderLiquid = mod.renderLiquid
309+
}
310+
return _renderLiquid
311+
}
312+
277313
/**
278314
* Render Liquid templates in content and extract links
279315
*
@@ -285,8 +321,8 @@ export async function extractLinksWithLiquid(
285321
context: Context,
286322
): Promise<LinkExtractionResult> {
287323
try {
288-
// Dynamic import to avoid circular dependency issues
289-
const { renderLiquid } = await import('@/content-render/liquid/index')
324+
// Dynamic import to avoid circular dependency issues (cached after first load)
325+
const renderLiquid = await getCachedRenderLiquid()
290326
// Render Liquid to expand conditionals
291327
const rendered = await renderLiquid(content, context)
292328
return extractLinksFromMarkdown(rendered)
@@ -298,6 +334,24 @@ export async function extractLinksWithLiquid(
298334
}
299335
}
300336

337+
/**
338+
* Render Liquid templates in content, returning both the rendered markdown string and
339+
* extracted links. Use this when both are needed to avoid rendering the same content twice.
340+
*/
341+
export async function renderAndExtractLinks(
342+
content: string,
343+
context: Context,
344+
): Promise<{ renderedMarkdown: string; result: LinkExtractionResult }> {
345+
try {
346+
const renderLiquid = await getCachedRenderLiquid()
347+
const renderedMarkdown = await renderLiquid(content, context)
348+
return { renderedMarkdown, result: extractLinksFromMarkdown(renderedMarkdown) }
349+
} catch (error) {
350+
console.warn('Liquid rendering failed, falling back to raw extraction:', error)
351+
return { renderedMarkdown: content, result: extractLinksFromMarkdown(content) }
352+
}
353+
}
354+
301355
/**
302356
* Read a file and extract links
303357
*/

0 commit comments

Comments
 (0)