Skip to content

Commit e3355ef

Browse files
authored
Merge pull request #65 from saagpatel/codex/fix/rust-panic-cleanup
fix(src): hoist web-ingest regexes and document unsafe mach FFI
2 parents 04473dc + e8d9e16 commit e3355ef

2 files changed

Lines changed: 52 additions & 18 deletions

File tree

src-tauri/src/diagnostics.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,23 @@ fn get_process_memory_bytes() -> u64 {
650650
) -> KernReturn;
651651
}
652652

653+
// SAFETY: Standard Mach FFI pattern for introspecting the current task.
654+
// - `mach_task_self()` returns the calling task's own port send right; it
655+
// takes no inputs and cannot fail for an in-process query.
656+
// - `task_info` is called with MACH_TASK_BASIC_INFO (flavor 20), whose
657+
// out-struct is `mach_task_basic_info_data_t`. We pass a matching
658+
// `MachTaskBasicInfo` buffer and the correctly computed
659+
// MACH_TASK_BASIC_INFO_COUNT, so the kernel writes exactly the fields
660+
// we declared.
661+
// - The buffer is allocated via `MaybeUninit::<_>::zeroed()`, which is
662+
// correctly aligned for the struct, and MachTaskBasicInfo is composed
663+
// entirely of plain-old-data integer types for which all-zero is a
664+
// valid bit pattern — so even on a partial kernel write nothing reads
665+
// uninitialized memory.
666+
// - `assume_init()` is only called on the success path (kr == 0), after
667+
// the kernel guarantees it has fully populated the struct.
668+
// - On failure we return 0 without touching `info`, so no uninitialized
669+
// data ever escapes.
653670
unsafe {
654671
let mut info = MaybeUninit::<MachTaskBasicInfo>::zeroed();
655672
let mut count = MACH_TASK_BASIC_INFO_COUNT;

src-tauri/src/kb/ingest/web.rs

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,38 @@ use crate::kb::network::{
1717
NetworkError, SsrfConfig,
1818
};
1919
use futures::StreamExt;
20+
use once_cell::sync::Lazy;
21+
use regex_lite::Regex;
2022
use reqwest::Client;
2123
use std::collections::HashSet;
2224
use std::sync::Arc;
2325
use std::time::Duration;
2426
use url::Url;
2527

28+
// HTML-scrubbing regexes. The patterns are static literals, so compilation is
29+
// infallible in practice — we panic once at first use with a descriptive
30+
// message if that invariant is ever broken, rather than recompiling and
31+
// unwrapping on every call.
32+
static HEADING_RE: Lazy<Regex> = Lazy::new(|| {
33+
Regex::new(r"<h([1-6])[^>]*>([^<]+)</h[1-6]>").expect("heading regex must compile")
34+
});
35+
static SCRIPT_RE: Lazy<Regex> = Lazy::new(|| {
36+
Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("script regex must compile")
37+
});
38+
static STYLE_RE: Lazy<Regex> =
39+
Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("style regex must compile"));
40+
static COMMENT_RE: Lazy<Regex> =
41+
Lazy::new(|| Regex::new(r"<!--.*?-->").expect("comment regex must compile"));
42+
static BLOCK_ELEMENT_RE: Lazy<Regex> = Lazy::new(|| {
43+
Regex::new(r"<(?:p|div|br|h[1-6]|li|tr)[^>]*>").expect("block-element regex must compile")
44+
});
45+
static HTML_TAG_RE: Lazy<Regex> =
46+
Lazy::new(|| Regex::new(r"<[^>]+>").expect("html-tag regex must compile"));
47+
static WHITESPACE_RE: Lazy<Regex> =
48+
Lazy::new(|| Regex::new(r"\s+").expect("whitespace regex must compile"));
49+
static NEWLINE_COLLAPSE_RE: Lazy<Regex> =
50+
Lazy::new(|| Regex::new(r"\n\s*\n").expect("newline-collapse regex must compile"));
51+
2652
/// Web page ingestion configuration
2753
#[derive(Debug, Clone)]
2854
pub struct WebIngestConfig {
@@ -739,9 +765,8 @@ fn extract_html_title(html: &str) -> Option<String> {
739765
/// Extract headings from HTML
740766
fn extract_headings(html: &str) -> Vec<(usize, String)> {
741767
let mut headings = Vec::new();
742-
let re = regex_lite::Regex::new(r"<h([1-6])[^>]*>([^<]+)</h[1-6]>").unwrap();
743768

744-
for cap in re.captures_iter(html) {
769+
for cap in HEADING_RE.captures_iter(html) {
745770
if let (Some(level), Some(text)) = (cap.get(1), cap.get(2)) {
746771
if let Ok(level) = level.as_str().parse::<usize>() {
747772
let text = html_entities_decode(text.as_str().trim());
@@ -757,33 +782,25 @@ fn extract_headings(html: &str) -> Vec<(usize, String)> {
757782

758783
/// Convert HTML to plain text (simple implementation)
759784
fn html_to_text(html: &str) -> String {
760-
// Remove script and style blocks
761-
let re_script = regex_lite::Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap();
762-
let re_style = regex_lite::Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap();
763-
let re_comments = regex_lite::Regex::new(r"<!--.*?-->").unwrap();
764-
765-
let text = re_script.replace_all(html, "");
766-
let text = re_style.replace_all(&text, "");
767-
let text = re_comments.replace_all(&text, "");
785+
// Remove script, style, and comment blocks
786+
let text = SCRIPT_RE.replace_all(html, "");
787+
let text = STYLE_RE.replace_all(&text, "");
788+
let text = COMMENT_RE.replace_all(&text, "");
768789

769790
// Replace block elements with newlines
770-
let re_blocks = regex_lite::Regex::new(r"<(?:p|div|br|h[1-6]|li|tr)[^>]*>").unwrap();
771-
let text = re_blocks.replace_all(&text, "\n");
791+
let text = BLOCK_ELEMENT_RE.replace_all(&text, "\n");
772792

773793
// Remove all remaining tags
774-
let re_tags = regex_lite::Regex::new(r"<[^>]+>").unwrap();
775-
let text = re_tags.replace_all(&text, "");
794+
let text = HTML_TAG_RE.replace_all(&text, "");
776795

777796
// Decode HTML entities
778797
let text = html_entities_decode(&text);
779798

780799
// Normalize whitespace
781-
let re_whitespace = regex_lite::Regex::new(r"\s+").unwrap();
782-
let text = re_whitespace.replace_all(&text, " ");
800+
let text = WHITESPACE_RE.replace_all(&text, " ");
783801

784802
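// Normalize newlines. NOTE(review): the `\s+` pass above has already replaced every newline with a space, so `\n\s*\n` appears unable to match here — confirm the intended order of these two steps.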
785-
let re_newlines = regex_lite::Regex::new(r"\n\s*\n").unwrap();
786-
let text = re_newlines.replace_all(&text, "\n\n");
803+
let text = NEWLINE_COLLAPSE_RE.replace_all(&text, "\n\n");
787804

788805
text.trim().to_string()
789806
}

0 commit comments

Comments
 (0)