Skip to content

Commit 543602e

Browse files
authored
Redo how links are extracted to maintain line numbers (#60818)
1 parent 9bcd3a7 commit 543602e

3 files changed

Lines changed: 283 additions & 26 deletions

File tree

src/links/lib/extract-links.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ const IMAGE_LINK_PATTERN = /!\[[^\]]*\]\(([^)]+)\)/g
2424
// Anchor link patterns (for same-page links)
2525
const ANCHOR_LINK_PATTERN = /\]\(#[^)]+\)/g
2626

27+
// Reference-style link definitions: [id]: /path or [id]: /path "title"
28+
// Captures the URL from lines like: [ssh-agent-forwarding]: /authentication/...
29+
const LINK_DEFINITION_PATTERN = /^\[[^\]]+\]:\s+(\/[^\s"'(<>]*)/gm
30+
31+
// Links whose href starts with a Liquid tag rather than a literal '/'
32+
// e.g. ]({% ifversion fpt %}/enterprise-cloud@latest{% endif %}/path)
33+
// None of these Liquid tags contain ')' in practice, so [^)]+ is safe.
34+
const LIQUID_HREF_PATTERN = /\]\(({%[^)]+)\)/g
35+
2736
export interface ExtractedLink {
2837
href: string
2938
line: number
@@ -39,6 +48,12 @@ export interface LinkExtractionResult {
3948
externalLinks: ExtractedLink[]
4049
anchorLinks: ExtractedLink[]
4150
imageLinks: ExtractedLink[]
51+
/**
52+
* Links whose href begins with a Liquid tag (e.g. `]({% ifversion ... %}/path)`).
53+
* The `href` field contains the raw unrendered Liquid string. Callers that need
54+
* to validate these links must render the href to obtain its canonical path.
55+
*/
56+
liquidPrefixedLinks: ExtractedLink[]
4257
}
4358

4459
/**
@@ -83,6 +98,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
8398
const externalLinks: ExtractedLink[] = []
8499
const anchorLinks: ExtractedLink[] = []
85100
const imageLinks: ExtractedLink[] = []
101+
const liquidPrefixedLinks: ExtractedLink[] = []
86102

87103
// Strip fenced code blocks to avoid checking example/placeholder URLs
88104
// Replaces non-newline characters with spaces to preserve line numbers and positions
@@ -189,11 +205,41 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
189205
// Reset regex
190206
IMAGE_LINK_PATTERN.lastIndex = 0
191207

208+
// Extract reference-style link definitions ([id]: /path)
209+
// These are distinct from inline links but point to the same targets that need validating.
210+
while ((match = LINK_DEFINITION_PATTERN.exec(strippedContent)) !== null) {
211+
const { line, column } = getLineAndColumn(strippedContent, match.index)
212+
const href = match[1].split('#')[0]
213+
internalLinks.push({
214+
href,
215+
line,
216+
column,
217+
isAutotitle: false,
218+
})
219+
}
220+
221+
// Reset regex
222+
LINK_DEFINITION_PATTERN.lastIndex = 0
223+
224+
// Extract links whose href starts with a Liquid tag
225+
while ((match = LIQUID_HREF_PATTERN.exec(strippedContent)) !== null) {
226+
const { line, column } = getLineAndColumn(strippedContent, match.index)
227+
liquidPrefixedLinks.push({
228+
href: match[1],
229+
line,
230+
column,
231+
})
232+
}
233+
234+
// Reset regex
235+
LIQUID_HREF_PATTERN.lastIndex = 0
236+
192237
return {
193238
internalLinks,
194239
externalLinks,
195240
anchorLinks,
196241
imageLinks,
242+
liquidPrefixedLinks,
197243
}
198244
}
199245

src/links/scripts/check-links-internal.ts

Lines changed: 143 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
* CHECK_ANCHORS - Whether to check anchor links (default: true)
1919
*/
2020

21+
import fs from 'fs'
22+
2123
import { program } from 'commander'
2224
import chalk from 'chalk'
2325
import { load } from 'cheerio'
@@ -31,6 +33,8 @@ import {
3133
checkInternalLink,
3234
checkAssetLink,
3335
isAssetLink,
36+
extractLinksWithLiquid,
37+
extractLinksFromMarkdown,
3438
} from '@/links/lib/extract-links'
3539
import {
3640
type BrokenLink,
@@ -62,34 +66,134 @@ interface CheckResult {
6266
}
6367

6468
/**
65-
* Render a page and extract all internal links from the HTML
69+
* Count how many lines the frontmatter block occupies in the raw source file.
70+
* `page.markdown` has frontmatter stripped, so line numbers from markdown
71+
* parsing are relative to the body. Adding this offset converts them to
72+
* actual file line numbers.
73+
*
74+
* Results are cached by fullPath — the file is read once per page across
75+
* both getLinksFromMarkdown() and checkAnchorsOnPage().
6676
*/
67-
async function getLinksFromRenderedPage(
68-
page: Page,
69-
permalink: Permalink,
70-
context: Context,
71-
): Promise<{ href: string; text: string }[]> {
72-
const links: { href: string; text: string }[] = []
77+
const frontmatterLineOffsetCache = new Map<string, number>()
7378

79+
function getFrontmatterLineOffset(fullPath: string): number {
80+
const cached = frontmatterLineOffsetCache.get(fullPath)
81+
if (cached !== undefined) return cached
82+
83+
let offset = 0
7484
try {
75-
// Render the page content
76-
const html = await renderContent(page.markdown, context)
77-
const $ = load(html)
85+
const raw = fs.readFileSync(fullPath, 'utf8')
86+
if (raw.startsWith('---')) {
87+
const lines = raw.split('\n')
88+
for (let i = 1; i < lines.length; i++) {
89+
if (lines[i].trimEnd() === '---') {
90+
// i is the 0-based index of the closing `---`; adding 1 gives the
91+
// 1-based line number of that delimiter, which is the total number
92+
// of frontmatter lines. Body content starts on the next line.
93+
offset = i + 1
94+
break
95+
}
96+
}
97+
}
98+
} catch {
99+
// ignore — fall back to no offset
100+
}
78101

79-
// Extract all anchor links
80-
$('a[href]').each((_, el) => {
81-
const href = $(el).attr('href')
82-
const text = $(el).text()
102+
frontmatterLineOffsetCache.set(fullPath, offset)
103+
return offset
104+
}
83105

84-
if (href && href.startsWith('/')) {
85-
links.push({ href, text })
106+
/**
107+
* Extract all internal links from the markdown source with accurate line numbers.
108+
*
109+
* Links are discovered from the Liquid-rendered content (which expands {% data reusables.xxx %}
110+
* and respects {% ifversion %} for the current version), so coverage matches the original
111+
* HTML-based checker. Line numbers are resolved against the raw markdown source to avoid
112+
* drift caused by Liquid post-processing (blank-line collapsing). Links that originate
113+
* from a reusable file rather than the page itself fall back to line 0.
114+
*/
115+
async function getLinksFromMarkdown(
116+
page: Page,
117+
context: Context,
118+
): Promise<{ href: string; text: string | undefined; line: number }[]> {
119+
const fmOffset = getFrontmatterLineOffset(page.fullPath)
120+
121+
// Build a map of raw-markdown line numbers per href, plus a parallel index
122+
// map to consume them in encounter order without shifting (O(1) per lookup).
123+
//
124+
// When a raw href contains Liquid tags (e.g. `/{% ifversion fpt %}enterprise-cloud@latest/{% endif %}path`),
125+
// the rendered href will differ from the raw string, so rawLinesByHref.get() would miss.
126+
// To fix this, we lazily import renderLiquid once and use it to resolve those hrefs to
127+
// their canonical (rendered) form before keying the map — matching what extractLinksWithLiquid produces.
128+
const rawResult = extractLinksFromMarkdown(page.markdown)
129+
130+
const needsLiquidHrefResolution =
131+
rawResult.internalLinks.some((l) => l.href.includes('{%') || l.href.includes('{{')) ||
132+
rawResult.liquidPrefixedLinks.length > 0
133+
type RenderLiquidFn = (template: string, context: unknown) => Promise<string>
134+
let renderLiquidFn: RenderLiquidFn | null = null
135+
if (needsLiquidHrefResolution) {
136+
const mod = await import('@/content-render/liquid/index')
137+
renderLiquidFn = mod.renderLiquid
138+
}
139+
140+
const rawLinesByHref = new Map<string, number[]>()
141+
for (const link of rawResult.internalLinks) {
142+
let canonicalHref = link.href
143+
if (renderLiquidFn && (canonicalHref.includes('{%') || canonicalHref.includes('{{'))) {
144+
try {
145+
// Render only the href string so we get the same canonical href that
146+
// extractLinksWithLiquid will produce, without affecting line positions.
147+
canonicalHref = (await renderLiquidFn(canonicalHref, context)).trim()
148+
} catch {
149+
// fall back to raw href if rendering fails
86150
}
87-
})
88-
} catch (error) {
89-
console.warn(`Failed to render ${page.relativePath} (${permalink.href}):`, error)
151+
}
152+
const existing = rawLinesByHref.get(canonicalHref)
153+
if (existing) {
154+
existing.push(link.line + fmOffset)
155+
} else {
156+
rawLinesByHref.set(canonicalHref, [link.line + fmOffset])
157+
}
90158
}
91159

92-
return links
160+
// Liquid-prefixed links (href starts with `{%`) are absent from internalLinks because
161+
// INTERNAL_LINK_PATTERN requires a leading '/'. Render each href to its canonical form
162+
// and, if the result is an internal path, add it to the map so lookups don't miss.
163+
if (renderLiquidFn) {
164+
for (const link of rawResult.liquidPrefixedLinks) {
165+
try {
166+
const rendered = (await renderLiquidFn(link.href, context)).trim().split('#')[0]
167+
if (rendered.startsWith('/')) {
168+
const existing = rawLinesByHref.get(rendered)
169+
if (existing) {
170+
existing.push(link.line + fmOffset)
171+
} else {
172+
rawLinesByHref.set(rendered, [link.line + fmOffset])
173+
}
174+
}
175+
} catch {
176+
// skip — can't resolve line number for this link
177+
}
178+
}
179+
}
180+
// Tracks how many line numbers have been consumed for each href.
181+
const rawLinesIndex = new Map<string, number>()
182+
183+
// The Liquid-rendered set drives which links are actually checked (expands
184+
// reusables, excludes version-gated links that don't apply here).
185+
// extractLinksWithLiquid already catches Liquid render failures internally and
186+
// falls back to raw extraction with a warning, so no outer try/catch is needed.
187+
const renderedResult = await extractLinksWithLiquid(page.markdown, context)
188+
const renderedLinks = renderedResult.internalLinks.map((l) => ({ href: l.href, text: l.text }))
189+
190+
return renderedLinks.map((link) => {
191+
const lines = rawLinesByHref.get(link.href)
192+
const idx = rawLinesIndex.get(link.href) ?? 0
193+
const line = lines && idx < lines.length ? lines[idx] : 0
194+
rawLinesIndex.set(link.href, idx + 1)
195+
return { href: link.href, text: link.text, line }
196+
})
93197
}
94198

95199
/**
@@ -111,6 +215,17 @@ async function checkAnchorsOnPage(
111215
}
112216

113217
try {
218+
// Extract anchor links from markdown first to get accurate line numbers
219+
const mdResult = extractLinksFromMarkdown(page.markdown)
220+
const fmOffset = getFrontmatterLineOffset(page.fullPath)
221+
const anchorLineMap = new Map<string, number>()
222+
for (const link of mdResult.anchorLinks) {
223+
// Store the first occurrence of each anchor href
224+
if (!anchorLineMap.has(link.href)) {
225+
anchorLineMap.set(link.href, link.line + fmOffset)
226+
}
227+
}
228+
114229
const html = await renderContent(page.markdown, context)
115230
const $ = load(html)
116231

@@ -126,10 +241,12 @@ async function checkAnchorsOnPage(
126241
const targetExists = $(`#${escapedId}`).length > 0 || $(`[name="${targetId}"]`).length > 0
127242

128243
if (!targetExists) {
244+
// Look up the line number from the markdown source
245+
const line = anchorLineMap.get(href) ?? 0
129246
brokenAnchors.push({
130247
href,
131248
file: page.relativePath,
132-
lines: [0], // Line number not available from rendered HTML
249+
lines: [line],
133250
text: $(el).text(),
134251
isAutotitle: false,
135252
})
@@ -194,8 +311,8 @@ async function checkVersion(
194311
// awaits before the next begins), so there is no concurrent access to baseContext.
195312
baseContext.page = page
196313

197-
// Get links from rendered page
198-
const links = await getLinksFromRenderedPage(page, permalink, baseContext)
314+
// Get links from markdown source (preserves accurate line numbers)
315+
const links = await getLinksFromMarkdown(page, baseContext)
199316
totalLinksChecked += links.length
200317

201318
// Check each link
@@ -208,7 +325,7 @@ async function checkVersion(
208325
brokenLinks.push({
209326
href: link.href,
210327
file: page.relativePath,
211-
lines: [0],
328+
lines: [link.line],
212329
text: link.text,
213330
})
214331
}
@@ -222,14 +339,14 @@ async function checkVersion(
222339
brokenLinks.push({
223340
href: link.href,
224341
file: page.relativePath,
225-
lines: [0],
342+
lines: [link.line],
226343
text: link.text,
227344
})
228345
} else if (result.isRedirect) {
229346
redirectLinks.push({
230347
href: link.href,
231348
file: page.relativePath,
232-
lines: [0],
349+
lines: [link.line],
233350
text: link.text,
234351
isRedirect: true,
235352
redirectTarget: result.redirectTarget,

0 commit comments

Comments
 (0)