fix: 8 bugs in security detection — UTF-8 panic, false positives, detection gaps

haasonsaas · claude · haasonsaas · commit 5bf1ca732e1d · 2026-03-11T21:42:18.000-07:00
- Fix redact() panic on multi-byte UTF-8 by using char_indices() for safe boundaries
- Fix RE_JWT matching literal backslash (not valid in JWT tokens)
- Fix connection string regex missing postgresql:// scheme (SQLAlchemy standard)
- Fix "des " false positive matching "nodes ", "codes ", etc. — require a leading space or start of text before "des ", and also match "3des"
- Fix CWE tag extraction only finding first occurrence — now extracts all
- Fix generate_code_suggestion matching "use" inside "because"/"refuse"/etc. — match "use"/"replace" only as whole whitespace-delimited words (ASCII case-insensitive)
- Remove dead Option wrapper from process_raw_comment (always returned Some)
- Fix "doc" miscategorizing "docker" as Documentation — use "documentation"/"docstring"
- Tighten overly-broad security keywords: missing header, file upload, input validation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/core/comment.rs b/src/core/comment.rs
@@ -132,9 +132,7 @@ impl CommentSynthesizer {
         let mut comments = Vec::new();
 
         for raw in raw_comments {
-            if let Some(comment) = Self::process_raw_comment(raw)? {
-                comments.push(comment);
-            }
+            comments.push(Self::process_raw_comment(raw)?);
         }
 
         Self::deduplicate_comments(&mut comments);
@@ -177,7 +175,7 @@ impl CommentSynthesizer {
         }
     }
 
-    fn process_raw_comment(raw: RawComment) -> Result<Option<Comment>> {
+    fn process_raw_comment(raw: RawComment) -> Result<Comment> {
         let lower = raw.content.to_lowercase();
         let severity = raw
             .severity
@@ -203,7 +201,7 @@ impl CommentSynthesizer {
         let code_suggestion = Self::generate_code_suggestion(&raw);
         let id = Self::generate_comment_id(&raw.file_path, &raw.content, &category);
 
-        Ok(Some(Comment {
+        Ok(Comment {
             id,
             file_path: raw.file_path,
             line_number: raw.line_number,
@@ -217,7 +215,7 @@ impl CommentSynthesizer {
             tags,
             fix_effort,
             feedback: None,
-        }))
+        })
     }
 
     fn generate_comment_id(file_path: &Path, content: &str, category: &Category) -> String {
@@ -306,7 +304,7 @@ impl CommentSynthesizer {
             || lower.contains("verbose error")
             || lower.contains("information disclosure")
             || lower.contains("security header")
-            || lower.contains("missing header")
+            || lower.contains("missing security header")
             // Unsafe code patterns
             || lower.contains("unsafe block")
             || lower.contains("unsafe {")
@@ -331,9 +329,10 @@ impl CommentSynthesizer {
             || lower.contains("no pagination")
             || lower.contains("unbounded query")
             || lower.contains("graphql depth")
-            || lower.contains("file upload")
             || lower.contains("insecure upload")
-            || lower.contains("input validation")
+            || lower.contains("unrestricted upload")
+            || (lower.contains("file upload") && (lower.contains("insecure") || lower.contains("unrestricted") || lower.contains("vulnerability") || lower.contains("security")))
+            || (lower.contains("input validation") && (lower.contains("missing") || lower.contains("vulnerability") || lower.contains("security") || lower.contains("injection")))
         {
             Category::Security
         } else if lower.contains("performance")
@@ -345,7 +344,10 @@ impl CommentSynthesizer {
             Category::Bug
         } else if lower.contains("style") || lower.contains("format") || lower.contains("naming") {
             Category::Style
-        } else if lower.contains("doc") || lower.contains("comment") {
+        } else if lower.contains("documentation")
+            || lower.contains("docstring")
+            || lower.contains("comment")
+        {
             Category::Documentation
         } else if lower.contains("test") || lower.contains("coverage") {
             Category::Testing
@@ -460,7 +462,9 @@ impl CommentSynthesizer {
 
         // ── Cryptography ──
         if lower.contains("weak cipher")
-            || lower.contains("des ")
+            || lower.contains(" des ")
+            || lower.starts_with("des ")
+            || lower.contains("3des")
             || lower.contains("rc4")
             || lower.contains("ecb mode")
         {
@@ -699,7 +703,9 @@ impl CommentSynthesizer {
 
         // ── Cryptography tags ──
         if lower.contains("weak cipher")
-            || lower.contains("des ")
+            || lower.contains(" des ")
+            || lower.starts_with("des ")
+            || lower.contains("3des")
             || lower.contains("rc4")
             || lower.contains("blowfish")
         {
@@ -743,7 +749,7 @@ impl CommentSynthesizer {
         if lower.contains("debug mode") {
             tags.push("debug-mode".to_string());
         }
-        if lower.contains("security header") || lower.contains("missing header") {
+        if lower.contains("security header") || lower.contains("missing security header") {
             tags.push("security-headers".to_string());
         }
         if lower.contains("information disclosure") || lower.contains("data exposure") {
@@ -823,15 +829,20 @@ impl CommentSynthesizer {
         }
 
         // ── CWE / OWASP tags ──
-        // Extract CWE numbers from content
-        if let Some(pos) = lower.find("cwe-") {
-            let cwe_rest = &lower[pos..];
-            let cwe_tag: String = cwe_rest
-                .chars()
-                .take_while(|c| c.is_alphanumeric() || *c == '-')
-                .collect();
-            if cwe_tag.len() > 4 {
-                tags.push(cwe_tag);
+        // Extract all CWE numbers from content
+        {
+            let mut search_from = 0;
+            while let Some(offset) = lower[search_from..].find("cwe-") {
+                let pos = search_from + offset;
+                let cwe_rest = &lower[pos..];
+                let cwe_tag: String = cwe_rest
+                    .chars()
+                    .take_while(|c| c.is_alphanumeric() || *c == '-')
+                    .collect();
+                if cwe_tag.len() > 4 && !tags.contains(&cwe_tag) {
+                    tags.push(cwe_tag);
+                }
+                search_from = pos + 4; // skip past "cwe-" to find next
             }
         }
 
@@ -898,7 +909,10 @@ impl CommentSynthesizer {
 
         // Fallback: generate a basic suggestion from the textual suggestion field
         if let Some(suggestion) = &raw.suggestion {
-            if suggestion.contains("use") || suggestion.contains("replace") {
+            let has_action_word = suggestion
+                .split_whitespace()
+                .any(|w| w.eq_ignore_ascii_case("use") || w.eq_ignore_ascii_case("replace"));
+            if has_action_word {
                 return Some(CodeSuggestion {
                     original_code: "// Original code would be extracted from context".to_string(),
                     suggested_code: suggestion.clone(),
diff --git a/src/plugins/builtin/secret_scanner.rs b/src/plugins/builtin/secret_scanner.rs
@@ -57,8 +57,7 @@ static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)(-----BEGIN[ A-Z0-9_-]{0,100}PRIVATE KEY(?:\sBLOCK)?-----)").unwrap()
 });
 static RE_JWT: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"\b(ey[a-zA-Z0-9]{17,}\.ey[a-zA-Z0-9/\\_\-]{17,}\.[a-zA-Z0-9/\\_\-]{10,}=*)\b")
-        .unwrap()
+    Regex::new(r"\b(ey[a-zA-Z0-9]{17,}\.ey[a-zA-Z0-9/_-]{17,}\.[a-zA-Z0-9/_-]{10,}=*)\b").unwrap()
 });
 static RE_GCP_KEY: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b(AIza[\w\-]{35})\b").unwrap());
 static RE_GCP_SA: Lazy<Regex> =
@@ -73,7 +72,7 @@ static RE_SENDGRID: Lazy<Regex> =
 static RE_TWILIO: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b(SK[0-9a-fA-F]{32})\b").unwrap());
 static RE_NPM: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b(npm_[a-z0-9]{36})\b").unwrap());
 static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(?i)((?:postgres|mysql|mongodb|redis|amqp|mssql)://[^:\s]+:[^@\s]+@[^\s]+)")
+    Regex::new(r"(?i)((?:postgres(?:ql)?|mysql|mongodb|redis|amqp|mssql)://[^:\s]+:[^@\s]+@[^\s]+)")
         .unwrap()
 });
 static RE_GENERIC_CRED: Lazy<Regex> = Lazy::new(|| {
@@ -391,11 +390,21 @@ fn is_false_positive(value: &str) -> bool {
 
 /// Redact a secret value, showing only the first few and last few chars.
 fn redact(value: &str) -> String {
-    if value.len() <= 8 {
-        return "*".repeat(value.len());
-    }
-    let show = 4.min(value.len() / 4);
-    format!("{}...{}", &value[..show], &value[value.len() - show..])
+    let char_count = value.chars().count();
+    if char_count <= 8 {
+        return "*".repeat(char_count);
+    }
+    let show = 4.min(char_count / 4);
+    // Use char_indices to find safe byte boundaries for multi-byte UTF-8
+    let prefix_end = value
+        .char_indices()
+        .nth(show)
+        .map_or(value.len(), |(i, _)| i);
+    let suffix_start = value
+        .char_indices()
+        .nth(char_count - show)
+        .map_or(value.len(), |(i, _)| i);
+    format!("{}...{}", &value[..prefix_end], &value[suffix_start..])
 }
 
 pub struct SecretScanner;
@@ -569,6 +578,19 @@ mod tests {
         assert_eq!(findings[0].rule_id, "sec.secrets.connection-string");
     }
 
+    #[test]
+    fn test_detects_postgresql_connection_string() {
+        let findings = SecretScanner::scan_line(
+            "DATABASE_URL=postgresql://admin:supersecret@db.example.com:5432/mydb",
+            3,
+        );
+        assert!(
+            !findings.is_empty(),
+            "Should detect postgresql:// connection string"
+        );
+        assert_eq!(findings[0].rule_id, "sec.secrets.connection-string");
+    }
+
     #[test]
     fn test_ignores_placeholder() {
         let findings = SecretScanner::scan_line("password = \"your-secret-here\"", 1);
@@ -619,6 +641,13 @@ mod tests {
         assert!(redacted.contains("..."));
     }
 
+    #[test]
+    fn test_redact_multibyte_utf8() {
+        // Must not panic on multi-byte UTF-8 characters
+        let redacted = redact("pässwörd_töken_sëcret_välue_here");
+        assert!(redacted.contains("..."));
+    }
+
     #[tokio::test]
     async fn test_scanner_only_scans_added_lines() {
         let diff = make_diff_with_lines(