@@ -37,6 +37,10 @@ static ID_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r#"id\s*=\s*["']([^"']+
 static NAME_PATTERN: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"name\s*=\s*["']([^"']+)["']"#).unwrap());
 
+// Shared regex pattern for extracting markdown links: [text](url)
+static MARKDOWN_LINK_PATTERN: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\[([^\]]*)\]\(([^)]*)\)").unwrap());
+
 pub(crate) struct MD051Linter {
     context: Rc<Context>,
     valid_fragments: HashSet<String>,
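The new MARKDOWN_LINK_PATTERN only matches when "](" appear back to back, which is what
lets the rule ignore bracketed text followed by a separate parenthetical. A minimal
sketch of the behavior being relied on (the assertions are illustrative, not part of the
commit; regex crate semantics assumed):

    let re = Regex::new(r"\[([^\]]*)\]\(([^)]*)\)").unwrap();
    assert!(re.is_match("[FAQ](#faq)"));          // inline link: matches
    assert!(!re.is_match("[notalink] a (#491)")); // text between ] and (: no match
    let caps = re.captures("[FAQ](#faq)").unwrap();
    assert_eq!(&caps[1], "FAQ");  // capture group 1: link text
    assert_eq!(&caps[2], "#faq"); // capture group 2: URL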
@@ -104,117 +108,73 @@ impl MD051Linter {
         None
     }
 
-    fn extract_link_fragments(&self, node: &Node) -> Vec<LinkFragment> {
-        // Extract link fragments using tree-sitter's native link structure
-        // Look for pattern: [ text ] ( URL ) where URL contains #fragment
-        let mut link_fragments = Vec::new();
-
-        // Traverse child nodes looking for link patterns
-        let mut i = 0;
-        while i < node.child_count() {
-            if let Some(child) = node.child(i) {
-                if child.kind() == "[" {
-                    // Found potential link start, look for complete link pattern
-                    if let Some((fragment_info, end_index)) = self.parse_link_at_position(node, i) {
-                        link_fragments.push(fragment_info);
-                        i = end_index;
+    fn extract_link_fragment_from_link_node(&self, node: &Node) -> Option<LinkFragment> {
+        // Extract the fragment from a tree-sitter "link" node
+        let start_byte = node.start_byte();
+        let end_byte = node.end_byte();
+        let document_content = self.context.document_content.borrow();
+        let link_text = &document_content[start_byte..end_byte];
+
+        // Use regex to extract the URL from an inline link: [text](url)
+        for cap in MARKDOWN_LINK_PATTERN.captures_iter(link_text) {
+            if let Some(url_match) = cap.get(2) {
+                let url = url_match.as_str();
+                // Only process internal fragments (URLs starting with #)
+                if url.starts_with('#') {
+                    if let Some(hash_pos) = url.rfind('#') {
+                        let fragment = &url[hash_pos + 1..];
+                        // Only process non-empty fragments that don't contain spaces
+                        if !fragment.is_empty() && !fragment.contains(' ') {
+                            return Some(LinkFragment {
+                                fragment: fragment.to_string(),
+                                range: node.range(),
+                            });
+                        }
                     }
                 }
-                i += 1;
             }
         }
 
-        link_fragments
+        None
     }
 
-    fn parse_link_at_position(
-        &self,
-        parent: &Node,
-        start_idx: usize,
-    ) -> Option<(LinkFragment, usize)> {
-        // Parse link pattern: [ text ] ( URL#fragment )
+    fn extract_link_fragments_from_inline(&self, node: &Node) -> Vec<LinkFragment> {
+        // Extract fragments from inline text using regex - more robust than tree parsing
+        let start_byte = node.start_byte();
+        let end_byte = node.end_byte();
         let document_content = self.context.document_content.borrow();
+        let text = &document_content[start_byte..end_byte];
+        let mut fragments = Vec::new();
 
-        // Look for the sequence: [ ... ] ( ... )
-        let mut bracket_close_idx = None;
-        let mut paren_open_idx = None;
-        let mut paren_close_idx = None;
-
-        // Find ] after [
-        for i in start_idx + 1..parent.child_count() {
-            if let Some(child) = parent.child(i) {
-                if child.kind() == "]" {
-                    bracket_close_idx = Some(i);
-                    break;
-                }
-            }
-        }
-
-        if let Some(bracket_close) = bracket_close_idx {
-            // Find ( after ]
-            for i in bracket_close + 1..parent.child_count() {
-                if let Some(child) = parent.child(i) {
-                    if child.kind() == "(" {
-                        paren_open_idx = Some(i);
-                        break;
-                    }
-                }
-            }
-        }
-
-        if let Some(paren_open) = paren_open_idx {
-            // Find ) after (
-            for i in paren_open + 1..parent.child_count() {
-                if let Some(child) = parent.child(i) {
-                    if child.kind() == ")" {
-                        paren_close_idx = Some(i);
-                        break;
-                    }
-                }
-            }
-        }
-
-        // If we found a complete link pattern
-        if let (Some(paren_open), Some(paren_close)) = (paren_open_idx, paren_close_idx) {
-            // Extract URL content between ( and ) by getting the text span
-            if let (Some(paren_open_node), Some(paren_close_node)) =
-                (parent.child(paren_open), parent.child(paren_close))
-            {
-                let start_byte = paren_open_node.end_byte(); // After the (
-                let end_byte = paren_close_node.start_byte(); // Before the )
-                let url_parts = &document_content[start_byte..end_byte];
+        // Use regex to match proper markdown links: [text](url) with no text between ] and (
+        for cap in MARKDOWN_LINK_PATTERN.captures_iter(text) {
+            if let Some(url_match) = cap.get(2) {
+                let url = url_match.as_str();
                 // Only process internal fragments (URLs starting with #)
-                if url_parts.starts_with('#') {
-                    if let Some(hash_pos) = url_parts.rfind('#') {
-                        let fragment = &url_parts[hash_pos + 1..];
+                if url.starts_with('#') {
+                    if let Some(hash_pos) = url.rfind('#') {
+                        let fragment = &url[hash_pos + 1..];
                         // Only process non-empty fragments that don't contain spaces
                         if !fragment.is_empty() && !fragment.contains(' ') {
-                            // Get the range of the entire link for position reporting
-                            if let (Some(start_node), Some(end_node)) =
-                                (parent.child(start_idx), parent.child(paren_close))
-                            {
-                                let link_range = tree_sitter::Range {
-                                    start_byte: start_node.start_byte(),
-                                    end_byte: end_node.end_byte(),
-                                    start_point: start_node.range().start_point,
-                                    end_point: end_node.range().end_point,
-                                };
-
-                                return Some((
-                                    LinkFragment {
-                                        fragment: fragment.to_string(),
-                                        range: link_range,
-                                    },
-                                    paren_close,
-                                ));
-                            }
+                            // Calculate the actual byte position of this match in the document
+                            let match_start = start_byte + cap.get(0).unwrap().start();
+                            let match_end = start_byte + cap.get(0).unwrap().end();
+                            fragments.push(LinkFragment {
+                                fragment: fragment.to_string(),
+                                range: tree_sitter::Range {
+                                    start_byte: match_start,
+                                    end_byte: match_end,
+                                    start_point: tree_sitter::Point::new(0, match_start),
+                                    end_point: tree_sitter::Point::new(0, match_end),
+                                },
+                            });
                         }
                     }
                 }
             }
         }
 
-        None
+        fragments
     }
 
     fn is_github_special_fragment(&self, fragment: &str) -> bool {
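Both new helpers share the same trimming rules: the URL must start with '#', and
rfind('#') keeps only the text after the last hash, so a multi-hash URL still yields a
single fragment. A worked sketch of that slice arithmetic (values illustrative):

    let url = "#section#subsection";
    assert!(url.starts_with('#'));
    let hash_pos = url.rfind('#').unwrap(); // index of the last '#', here 8
    let fragment = &url[hash_pos + 1..];
    assert_eq!(fragment, "subsection");     // not "section#subsection"
    assert!(!fragment.is_empty() && !fragment.contains(' '));

Note that the regex path fabricates tree_sitter::Point values with row 0 and byte
offsets as columns; presumably downstream reporting keys off the byte range, but that
is an assumption about the surrounding code, not something this hunk shows.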
@@ -297,6 +257,12 @@ impl RuleLinter for MD051Linter {
                     }
                 }
             }
+            "link" => {
+                // Extract link fragments from actual tree-sitter link nodes
+                if let Some(link_fragment) = self.extract_link_fragment_from_link_node(node) {
+                    self.link_fragments.push(link_fragment);
+                }
+            }
             "inline" | "html_block" => {
                 // Extract HTML id and name attributes
                 let ids = self.extract_html_id_or_name(node);
@@ -305,8 +271,13 @@ impl RuleLinter for MD051Linter {
                     self.valid_fragments_lowercase.insert(id.to_lowercase());
                 }
 
-                // Also look for links in inline content
-                let link_fragments = self.extract_link_fragments(node);
+                // We can't rely solely on tree-sitter "link" nodes for link extraction:
+                // 1. Tree-sitter may not parse every link structure as a "link" node in complex inline contexts.
+                // 2. Markdown parsers interpret what constitutes a valid link structure differently.
+                // 3. Tree-sitter's parsing can vary with surrounding context and nesting.
+                // To catch every valid [text](#fragment) pattern regardless of how it's categorized,
+                // we therefore supplement tree-sitter link nodes with regex-based extraction from inline content.
+                let link_fragments = self.extract_link_fragments_from_inline(node);
                 for link_fragment in link_fragments {
                     self.link_fragments.push(link_fragment);
                 }
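To illustrate what the regex pass buys (hypothetical markdown; actual node shapes
depend on the tree-sitter grammar version), running the shared pattern over a whole
inline span picks up fragment links however they are nested. This sketch mirrors the
helper's filtering minus the rfind and empty/space checks:

    let inline_span = "*see [the FAQ](#faq)* and [setup](#setup-notes)";
    let fragments: Vec<&str> = MARKDOWN_LINK_PATTERN
        .captures_iter(inline_span)
        .filter_map(|cap| cap.get(2))       // capture group 2: the URL
        .map(|m| m.as_str())
        .filter(|url| url.starts_with('#')) // internal fragments only
        .map(|url| &url[1..])
        .collect();
    assert_eq!(fragments, vec!["faq", "setup-notes"]);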
@@ -801,4 +772,54 @@ This paragraph has [valid link](#test-heading-one) and [invalid link](#nonexiste
         // Should have no violations - colons should be removed per GitHub spec
         assert_eq!(0, violations.len());
     }
+
+    #[test]
+    fn test_capital_letters() {
+        let input = "
+[FAQ](#FAQ)
+## FAQ
+";
+
+        let mut config = test_config();
+        config.linters.settings.link_fragments.ignore_case = true;
+        let mut multi_linter =
+            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), config, input);
+        let violations = multi_linter.analyze();
+
+        // Should have 0 violations - case is ignored with ignore_case = true
+        assert_eq!(0, violations.len());
+    }
+
+    #[test]
+    fn test_capital_letters_case_sensitive() {
+        let input = "
+[FAQ](#FAQ)
+## FAQ
+";
+
+        let mut config = test_config();
+        config.linters.settings.link_fragments.ignore_case = false;
+        let mut multi_linter =
+            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), config, input);
+        let violations = multi_linter.analyze();
+
+        // Should have 1 violation - case mismatch with ignore_case = false
+        assert_eq!(1, violations.len());
+        assert!(violations[0].message().contains("Link fragment 'FAQ'"));
+    }
+
+    #[test]
+    fn test_not_a_link() {
+        let input = "
+- [notalink] a (#491)
+";
+
+        let config = test_config();
+        let mut multi_linter =
+            MultiRuleLinter::new_for_document(PathBuf::from("test.md"), config, input);
+        let violations = multi_linter.analyze();
+
+        // Should have no violations - [notalink] a (#491) is not a markdown link
+        assert_eq!(0, violations.len());
+    }
 }
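The two capital-letter tests hinge on GitHub-style slugging: a "## FAQ" heading anchors
as "faq" (lowercased), so the link fragment "FAQ" resolves only when case is ignored. A
minimal sketch of that lookup, assuming the linter consults the slugged fragment set and
its lowercased mirror for ignore_case (illustrative, not the rule's actual code path):

    use std::collections::HashSet;

    let valid_fragments: HashSet<String> = HashSet::from(["faq".to_string()]); // slug of "## FAQ"
    let link_fragment = "FAQ";

    assert!(!valid_fragments.contains(link_fragment));                // ignore_case = false: violation
    assert!(valid_fragments.contains(&link_fragment.to_lowercase())); // ignore_case = true: ok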