@@ -17,12 +17,38 @@ use crate::kb::network::{
1717 NetworkError , SsrfConfig ,
1818} ;
1919use futures:: StreamExt ;
20+ use once_cell:: sync:: Lazy ;
21+ use regex_lite:: Regex ;
2022use reqwest:: Client ;
2123use std:: collections:: HashSet ;
2224use std:: sync:: Arc ;
2325use std:: time:: Duration ;
2426use url:: Url ;
2527
28+ // HTML-scrubbing regexes. The patterns are static literals, so compilation is
29+ // infallible in practice — we panic once at first use with a descriptive
30+ // message if that invariant is ever broken, rather than recompiling and
31+ // unwrapping on every call.
32+ static HEADING_RE : Lazy < Regex > = Lazy :: new ( || {
33+ Regex :: new ( r"<h([1-6])[^>]*>([^<]+)</h[1-6]>" ) . expect ( "heading regex must compile" )
34+ } ) ;
35+ static SCRIPT_RE : Lazy < Regex > = Lazy :: new ( || {
36+ Regex :: new ( r"(?is)<script[^>]*>.*?</script>" ) . expect ( "script regex must compile" )
37+ } ) ;
38+ static STYLE_RE : Lazy < Regex > =
39+ Lazy :: new ( || Regex :: new ( r"(?is)<style[^>]*>.*?</style>" ) . expect ( "style regex must compile" ) ) ;
40+ static COMMENT_RE : Lazy < Regex > =
41+ Lazy :: new ( || Regex :: new ( r"<!--.*?-->" ) . expect ( "comment regex must compile" ) ) ;
42+ static BLOCK_ELEMENT_RE : Lazy < Regex > = Lazy :: new ( || {
43+ Regex :: new ( r"<(?:p|div|br|h[1-6]|li|tr)[^>]*>" ) . expect ( "block-element regex must compile" )
44+ } ) ;
45+ static HTML_TAG_RE : Lazy < Regex > =
46+ Lazy :: new ( || Regex :: new ( r"<[^>]+>" ) . expect ( "html-tag regex must compile" ) ) ;
47+ static WHITESPACE_RE : Lazy < Regex > =
48+ Lazy :: new ( || Regex :: new ( r"\s+" ) . expect ( "whitespace regex must compile" ) ) ;
49+ static NEWLINE_COLLAPSE_RE : Lazy < Regex > =
50+ Lazy :: new ( || Regex :: new ( r"\n\s*\n" ) . expect ( "newline-collapse regex must compile" ) ) ;
51+
2652/// Web page ingestion configuration
2753#[ derive( Debug , Clone ) ]
2854pub struct WebIngestConfig {
@@ -739,9 +765,8 @@ fn extract_html_title(html: &str) -> Option<String> {
739765/// Extract headings from HTML
740766fn extract_headings ( html : & str ) -> Vec < ( usize , String ) > {
741767 let mut headings = Vec :: new ( ) ;
742- let re = regex_lite:: Regex :: new ( r"<h([1-6])[^>]*>([^<]+)</h[1-6]>" ) . unwrap ( ) ;
743768
744- for cap in re . captures_iter ( html) {
769+ for cap in HEADING_RE . captures_iter ( html) {
745770 if let ( Some ( level) , Some ( text) ) = ( cap. get ( 1 ) , cap. get ( 2 ) ) {
746771 if let Ok ( level) = level. as_str ( ) . parse :: < usize > ( ) {
747772 let text = html_entities_decode ( text. as_str ( ) . trim ( ) ) ;
@@ -757,33 +782,25 @@ fn extract_headings(html: &str) -> Vec<(usize, String)> {
757782
758783/// Convert HTML to plain text (simple implementation)
759784fn html_to_text ( html : & str ) -> String {
760- // Remove script and style blocks
761- let re_script = regex_lite:: Regex :: new ( r"(?is)<script[^>]*>.*?</script>" ) . unwrap ( ) ;
762- let re_style = regex_lite:: Regex :: new ( r"(?is)<style[^>]*>.*?</style>" ) . unwrap ( ) ;
763- let re_comments = regex_lite:: Regex :: new ( r"<!--.*?-->" ) . unwrap ( ) ;
764-
765- let text = re_script. replace_all ( html, "" ) ;
766- let text = re_style. replace_all ( & text, "" ) ;
767- let text = re_comments. replace_all ( & text, "" ) ;
785+ // Remove script, style, and comment blocks
786+ let text = SCRIPT_RE . replace_all ( html, "" ) ;
787+ let text = STYLE_RE . replace_all ( & text, "" ) ;
788+ let text = COMMENT_RE . replace_all ( & text, "" ) ;
768789
769790 // Replace block elements with newlines
770- let re_blocks = regex_lite:: Regex :: new ( r"<(?:p|div|br|h[1-6]|li|tr)[^>]*>" ) . unwrap ( ) ;
771- let text = re_blocks. replace_all ( & text, "\n " ) ;
791+ let text = BLOCK_ELEMENT_RE . replace_all ( & text, "\n " ) ;
772792
773793 // Remove all remaining tags
774- let re_tags = regex_lite:: Regex :: new ( r"<[^>]+>" ) . unwrap ( ) ;
775- let text = re_tags. replace_all ( & text, "" ) ;
794+ let text = HTML_TAG_RE . replace_all ( & text, "" ) ;
776795
777796 // Decode HTML entities
778797 let text = html_entities_decode ( & text) ;
779798
780799 // Normalize whitespace
781- let re_whitespace = regex_lite:: Regex :: new ( r"\s+" ) . unwrap ( ) ;
782- let text = re_whitespace. replace_all ( & text, " " ) ;
800+ let text = WHITESPACE_RE . replace_all ( & text, " " ) ;
783801
784802 // Normalize newlines
785- let re_newlines = regex_lite:: Regex :: new ( r"\n\s*\n" ) . unwrap ( ) ;
786- let text = re_newlines. replace_all ( & text, "\n \n " ) ;
803+ let text = NEWLINE_COLLAPSE_RE . replace_all ( & text, "\n \n " ) ;
787804
788805 text. trim ( ) . to_string ( )
789806}
0 commit comments