@@ -7,6 +7,7 @@ use clap::Parser;
77use cortex_engine:: create_client_builder;
88use reqwest:: Client ;
99use scraper:: { Html , Selector } ;
10+ use std:: borrow:: Cow ;
1011use std:: collections:: HashMap ;
1112use std:: io:: Write ;
1213use std:: path:: PathBuf ;
@@ -101,6 +102,107 @@ pub struct ScrapeCommand {
101102 pub verbose : bool ,
102103}
103104
/// Decode common HTML character references in URLs (#2449).
///
/// HTML attributes like `href` often contain encoded entities such as `&amp;`
/// for `&`. This function decodes the references that realistically show up
/// in URLs:
///
/// - the named entities `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp`
/// - decimal numeric references (`&#38;`, including zero-padded forms such
///   as `&#038;`)
/// - hexadecimal numeric references (`&#x26;` / `&#X26;`)
///
/// Anything else is copied through unchanged: unknown names, references
/// without a terminating `;`, names longer than ten characters, and numeric
/// references that resolve to an invalid code point or a control character.
///
/// Returns `Cow::Borrowed` when `text` contains no `&` at all, otherwise a
/// freshly built `Cow::Owned` string.
fn decode_html_entities(text: &str) -> Cow<'_, str> {
    // Upper bound on the reference name (the part between `&` and `;`).
    // Keeps the per-ampersand look-ahead bounded on hostile input.
    const MAX_ENTITY_LEN: usize = 10;

    // Quick check: without an ampersand there is nothing to decode.
    if !text.contains('&') {
        return Cow::Borrowed(text);
    }

    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        // Everything before the ampersand is literal text.
        result.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // Candidate name: leading run of ASCII alphanumerics or '#'.
        // All matched bytes are ASCII, so slicing at `name_len` is always on
        // a char boundary.
        let name_len = after
            .bytes()
            .take(MAX_ENTITY_LEN + 1)
            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'#')
            .count();
        let terminated = after.as_bytes().get(name_len) == Some(&b';');

        let decoded = if terminated && name_len <= MAX_ENTITY_LEN {
            resolve_html_entity(&after[..name_len])
        } else {
            None
        };

        match decoded {
            Some(ch) => {
                result.push(ch);
                // Skip the name and the terminating semicolon.
                rest = &after[name_len + 1..];
            }
            None => {
                // Not a reference we decode: keep the ampersand and let the
                // following text be copied verbatim by later iterations.
                result.push('&');
                rest = after;
            }
        }
    }

    result.push_str(rest);
    Cow::Owned(result)
}

/// Resolve a reference name (the text between `&` and `;`) to a character.
///
/// Returns `None` for names this module does not decode, and for numeric
/// references that are malformed, are not Unicode scalar values, or denote
/// control characters (those are never wanted inside a link target).
fn resolve_html_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "nbsp" => Some(' '),
        _ => {
            let digits = name.strip_prefix('#')?;
            let hex = digits
                .strip_prefix('x')
                .or_else(|| digits.strip_prefix('X'));
            let code = match hex {
                Some(hex_digits) => u32::from_str_radix(hex_digits, 16).ok()?,
                None => digits.parse::<u32>().ok()?,
            };
            char::from_u32(code).filter(|ch| !ch.is_control())
        }
    }
}
176+
177+ /// Validate and sanitize URL for security (#2448).
178+ ///
179+ /// Rejects URLs containing:
180+ /// - Control characters (including null bytes) which could enable request smuggling
181+ /// - Invalid percent-encoded sequences
182+ fn validate_url_security ( url : & str ) -> Result < ( ) > {
183+ // Check for control characters (ASCII 0-31, 127)
184+ for ( i, c) in url. chars ( ) . enumerate ( ) {
185+ if c. is_control ( ) {
186+ bail ! (
187+ "URL contains control character at position {} (code: {}). \
188+ Control characters in URLs can enable HTTP request smuggling attacks.",
189+ i,
190+ c as u32
191+ ) ;
192+ }
193+ }
194+
195+ // Check for null bytes in percent-encoded form
196+ if url. contains ( "%00" ) || url. contains ( "%0" ) {
197+ bail ! (
198+ "URL contains percent-encoded null byte (%00). \
199+ This can cause security vulnerabilities."
200+ ) ;
201+ }
202+
203+ Ok ( ( ) )
204+ }
205+
104206impl ScrapeCommand {
105207 /// Run the scrape command.
106208 pub async fn run ( self ) -> Result < ( ) > {
@@ -109,6 +211,9 @@ impl ScrapeCommand {
109211 bail ! ( "URL cannot be empty" ) ;
110212 }
111213
214+ // Validate URL for security (control characters, etc.) (#2448)
215+ validate_url_security ( & self . url ) ?;
216+
112217 // Parse output format
113218 let format: OutputFormat = self . format . parse ( ) ?;
114219
@@ -654,7 +759,9 @@ fn process_node_to_markdown(
654759 no_links,
655760 ) ;
656761 } else {
657- let href = elem. attr ( "href" ) . unwrap_or ( "" ) ;
762+ let href_raw = elem. attr ( "href" ) . unwrap_or ( "" ) ;
763+ // Decode HTML entities in href (e.g., &amp; -> &) (#2449)
764+ let href = decode_html_entities ( href_raw) ;
658765 output. push ( '[' ) ;
659766 process_node_to_markdown (
660767 element_ref,
@@ -1240,4 +1347,66 @@ mod tests {
12401347 "Client should build successfully with timeout 0"
12411348 ) ;
12421349 }
1350+
#[test]
fn test_html_to_markdown_links_with_html_entities() {
    // HTML entities inside href values must come out decoded in the
    // generated Markdown link target (Issue #2449).
    let html = r#"<a href="/search?q=test&amp;page=2">Next</a>"#;
    let md = html_to_markdown(html, false, false);
    assert!(
        md.contains("[Next](/search?q=test&page=2)"),
        "Expected decoded URL with &, got: {}",
        md
    );

    // Several entities in one URL.
    let html = r#"<a href="/path?a=1&amp;b=2&amp;c=3">Link</a>"#;
    let md = html_to_markdown(html, false, false);
    assert!(md.contains("/path?a=1&b=2&c=3"));
}
1367+
#[test]
fn test_decode_html_entities() {
    // Basic named entities.
    assert_eq!(decode_html_entities("&amp;").as_ref(), "&");
    assert_eq!(decode_html_entities("&lt;").as_ref(), "<");
    assert_eq!(decode_html_entities("&gt;").as_ref(), ">");
    assert_eq!(decode_html_entities("&quot;").as_ref(), "\"");

    // URL containing an encoded ampersand.
    assert_eq!(
        decode_html_entities("/search?q=test&amp;page=2").as_ref(),
        "/search?q=test&page=2"
    );

    // No ampersand at all: the input must be returned borrowed.
    let result = decode_html_entities("plain text");
    assert!(matches!(result, Cow::Borrowed(_)));

    // Entity surrounded by ordinary text.
    assert_eq!(
        decode_html_entities("before &amp; after").as_ref(),
        "before & after"
    );

    // Incomplete entity (no semicolon) is left untouched.
    assert_eq!(decode_html_entities("&amp text").as_ref(), "&amp text");

    // Unknown entity is left untouched.
    assert_eq!(decode_html_entities("&unknown;").as_ref(), "&unknown;");
}
1398+
#[test]
fn test_validate_url_security() {
    // Well-formed URLs are accepted.
    let accepted = [
        "https://example.com/path",
        "https://example.com/search?q=test&page=2",
    ];
    for url in accepted.iter() {
        assert!(validate_url_security(url).is_ok());
    }

    // Raw control characters and a percent-encoded null byte are refused.
    let rejected = [
        "https://example.com/\0 path",
        "https://example.com/\n path",
        "https://example.com/%00path",
    ];
    for url in rejected.iter() {
        assert!(validate_url_security(url).is_err());
    }
}
12431412}
0 commit comments