feat: normalize space-separated SAM headers in web UI (#13)

nh13 · web-flow · commit 37d1db7211f4 · 2026-03-26T13:19:42.000-07:00
* feat: normalize space-separated SAM header lines to tabs

* feat: detect space-separated SAM headers in format detection

* feat: propagate whitespace normalization warning in API response

* feat: display whitespace normalization warning in web UI

* refactor: use Cow to avoid allocation, extract is_sam_record helper, move warning styles to CSS
diff --git a/src/parsing/sam.rs b/src/parsing/sam.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::io::BufReader;
 use std::path::Path;
 use thiserror::Error;
@@ -182,6 +183,83 @@ fn header_to_query(
     Ok(query)
 }
 
+/// Normalize SAM header lines that use spaces instead of tabs.
+///
+/// Browsers and copy-paste often convert tabs to spaces. Lines that start with
+/// `@HD`, `@SQ`, `@RG` or `@PG`, contain no tab, and carry a `TAG:` field are
+/// rebuilt with every whitespace run replaced by one tab. All other lines are
+/// copied through unchanged, including `@CO` comments (free-form text) and
+/// blank lines -- leading blank lines are kept, not dropped.
+///
+/// Caveats: whitespace inside a value (e.g. a `CL:` command line) also becomes
+/// a tab, and a rewritten document always uses `\n` as its line terminator.
+///
+/// Returns the (possibly normalized) text and a boolean indicating whether any
+/// normalization was performed. Uses `Cow` to avoid allocation when no
+/// normalization is needed (the common case for well-formed input).
+#[must_use]
+pub fn normalize_sam_whitespace(text: &str) -> (Cow<'_, str>, bool) {
+    // Fast path: check if any line needs normalization before allocating
+    if !text.lines().any(needs_space_to_tab_normalization) {
+        return (Cow::Borrowed(text), false);
+    }
+
+    let mut normalized = String::with_capacity(text.len());
+
+    // Key the separator off the line index, not off `normalized.is_empty()`:
+    // the buffer is still empty after a leading blank line, which previously
+    // caused such lines to be dropped.
+    for (line_idx, line) in text.lines().enumerate() {
+        if line_idx > 0 {
+            normalized.push('\n');
+        }
+
+        if needs_space_to_tab_normalization(line) {
+            // Collapse each whitespace run between fields into one tab.
+            for (field_idx, field) in line.split_whitespace().enumerate() {
+                if field_idx > 0 {
+                    normalized.push('\t');
+                }
+                normalized.push_str(field);
+            }
+        } else {
+            normalized.push_str(line);
+        }
+    }
+
+    // `lines()` drops the final terminator; restore it if the input had one
+    if text.ends_with('\n') {
+        normalized.push('\n');
+    }
+
+    (Cow::Owned(normalized), true)
+}
+
+/// Check if a line is a SAM header line that uses spaces instead of tabs.
+///
+/// Returns true only when the line contains NO tab, starts with `@HD`, `@SQ`,
+/// `@RG` or `@PG` followed by a space, and some later whitespace-separated
+/// field begins with two ASCII uppercase letters and a colon (e.g. `SN:`).
+fn needs_space_to_tab_normalization(line: &str) -> bool {
+    if line.contains('\t') {
+        return false;
+    }
+
+    // @CO is deliberately not listed: comments are free-form — do not normalize
+    let sam_prefixes = ["@HD ", "@SQ ", "@RG ", "@PG "];
+    if !sam_prefixes.iter().any(|p| line.starts_with(p)) {
+        return false;
+    }
+
+    // Need at least one `XX:` field (two A-Z bytes, then `:`) after the record type
+    line.split_whitespace().skip(1).any(|field| {
+        field.len() >= 3
+            && field.as_bytes().get(2) == Some(&b':')
+            && field.as_bytes()[0].is_ascii_uppercase()
+            && field.as_bytes()[1].is_ascii_uppercase()
+    })
+}
+
 /// Parse header from raw text (stdin or pasted)
 ///
 /// # Errors
@@ -190,6 +268,8 @@ fn header_to_query(
 /// required fields, or no contigs are found, or `ParseError::TooManyContigs`
 /// if the limit is exceeded.
 pub fn parse_header_text(text: &str) -> Result<QueryHeader, ParseError> {
+    let (normalized_text, _) = normalize_sam_whitespace(text);
+    let text = &normalized_text;
     let mut contigs = Vec::new();
 
     for line in text.lines() {
@@ -331,4 +411,86 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_normalize_sam_whitespace_spaces_to_tabs() {
+        let input = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
+        let (normalized, was_normalized) = normalize_sam_whitespace(input);
+        assert!(was_normalized); // a space-only @SQ line must be reported as rewritten
+        assert_eq!(
+            normalized,
+            "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n"
+        );
+    }
+
+    #[test]
+    fn test_normalize_sam_whitespace_already_tabs() {
+        let input = "@SQ\tSN:chr1\tLN:248956422\n";
+        let (normalized, was_normalized) = normalize_sam_whitespace(input);
+        assert!(!was_normalized); // tab-delimited input takes the no-op fast path
+        assert_eq!(normalized, input); // and the text comes back unchanged
+    }
+
+    #[test]
+    fn test_normalize_sam_whitespace_mixed_lines() {
+        let input =
+            "@HD VN:1.6 SO:coordinate\n@SQ SN:chr1 LN:248956422\n@SQ SN:chr2 LN:242193529\n";
+        let (normalized, was_normalized) = normalize_sam_whitespace(input);
+        assert!(was_normalized); // one @HD and two @SQ lines, all space-separated
+        assert!(normalized.contains("@HD\tVN:1.6\tSO:coordinate"));
+        assert!(normalized.contains("@SQ\tSN:chr1\tLN:248956422"));
+        assert!(normalized.contains("@SQ\tSN:chr2\tLN:242193529"));
+    }
+
+    #[test]
+    fn test_normalize_sam_whitespace_multiple_spaces() {
+        let input = "@SQ  SN:chr1  LN:248956422\n";
+        let (normalized, was_normalized) = normalize_sam_whitespace(input);
+        assert!(was_normalized);
+        assert_eq!(normalized, "@SQ\tSN:chr1\tLN:248956422\n"); // each run -> one tab
+    }
+
+    #[test]
+    fn test_normalize_sam_whitespace_preserves_non_header_lines() {
+        let input = "some random text with spaces\n@SQ SN:chr1 LN:100\n";
+        let (normalized, was_normalized) = normalize_sam_whitespace(input);
+        assert!(was_normalized); // triggered by the @SQ line only
+        assert!(normalized.starts_with("some random text with spaces\n"));
+        assert!(normalized.contains("@SQ\tSN:chr1\tLN:100"));
+    }
+
+    #[test]
+    fn test_normalize_sam_whitespace_tabs_and_spaces_mixed() {
+        // Any tab on the line disables normalization, even if spaces remain between fields
+        let input = "@SQ\tSN:chr1 LN:248956422\n";
+        let (normalized, was_normalized) = normalize_sam_whitespace(input);
+        assert!(!was_normalized);
+        assert_eq!(normalized, input);
+    }
+
+    #[test]
+    fn test_normalize_sam_whitespace_skips_comment_lines() {
+        let input = "@CO This is a comment with VN:1.0 mentioned\n@SQ SN:chr1 LN:100\n";
+        let (normalized, was_normalized) = normalize_sam_whitespace(input);
+        assert!(was_normalized);
+        // @CO must stay byte-identical even though it contains a TAG:-like `VN:1.0` token
+        assert!(normalized.starts_with("@CO This is a comment with VN:1.0 mentioned\n"));
+        assert!(normalized.contains("@SQ\tSN:chr1\tLN:100"));
+    }
+
+    #[test]
+    fn test_parse_header_text_with_spaces() {
+        let header = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n\
+                      @SQ SN:chr2 LN:242193529\n";
+        let query = parse_header_text(header).unwrap(); // normalizes spaces internally
+        assert_eq!(query.contigs.len(), 2);
+        assert_eq!(query.contigs[0].name, "chr1");
+        assert_eq!(query.contigs[0].length, 248_956_422);
+        assert_eq!(
+            query.contigs[0].md5,
+            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
+        );
+        assert_eq!(query.contigs[1].name, "chr2");
+        assert_eq!(query.contigs[1].length, 242_193_529);
+    }
 }
diff --git a/src/web/format_detection.rs b/src/web/format_detection.rs
@@ -135,6 +135,17 @@ fn detect_format_from_filename(filename: &str) -> Option<FileFormat> {
     }
 }
 
+/// Check if `line` starts with `prefix` immediately followed by a tab or a space.
+///
+/// SAM headers use tabs as delimiters, but copy-pasted text often has spaces instead.
+fn is_sam_record(line: &str, prefix: &str) -> bool {
+    line.starts_with(prefix)
+        && line
+            .as_bytes()
+            .get(prefix.len()) // `None` when the line ends right after `prefix`
+            .is_some_and(|&b| b == b'\t' || b == b' ')
+}
+
 /// Detect format from file content analysis
 fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError> {
     let content_trimmed = content.trim();
@@ -155,14 +166,14 @@ fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError>
     let lines: Vec<&str> = content_trimmed.lines().take(20).collect(); // Sample first 20 lines
 
     // Picard dictionary: starts with @HD and has @SQ lines (check BEFORE Sam)
-    if lines.iter().any(|line| line.starts_with("@HD\t"))
-        && lines.iter().any(|line| line.starts_with("@SQ\t"))
+    if lines.iter().any(|line| is_sam_record(line, "@HD"))
+        && lines.iter().any(|line| is_sam_record(line, "@SQ"))
     {
         return Ok(FileFormat::Dict);
     }
 
     // SAM header format: starts with @SQ lines
-    if lines.iter().any(|line| line.starts_with("@SQ\t")) {
+    if lines.iter().any(|line| is_sam_record(line, "@SQ")) {
         return Ok(FileFormat::Sam);
     }
 
@@ -555,6 +566,26 @@ mod tests {
         assert!(!validate_format_content("@SQ\tSN:chr1", &FileFormat::Vcf));
     }
 
+    #[test] // a space-delimited @SQ line with no @HD should still be detected as SAM
+    fn test_sam_header_detection_with_spaces() {
+        let content = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
+        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Sam));
+    }
+
+    #[test] // @HD plus @SQ, both space-delimited, should be detected as Dict
+    fn test_dict_detection_with_spaces() {
+        let content = "@HD VN:1.0 SO:coordinate\n@SQ SN:chr1 LN:248956422\n";
+        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Dict));
+    }
+
+    #[test]
+    fn test_sam_validation_with_spaces() {
+        assert!(validate_format_content(
+            "@SQ SN:chr1 LN:123", // space-delimited @SQ record, no tabs
+            &FileFormat::Sam
+        ));
+    }
+
     #[test]
     fn test_combined_detection() {
         let content = "@SQ\tSN:chr1\tLN:248956422\n";
diff --git a/src/web/server.rs b/src/web/server.rs
@@ -327,8 +327,8 @@ async fn identify_handler(
     };
 
     // Parse input using intelligent format detection
-    let query = match parse_input_data(&input_data) {
-        Ok(query) => query,
+    let (query, parse_warnings) = match parse_input_data(&input_data) {
+        Ok(result) => result,
         Err(error_response) => return *error_response,
     };
 
@@ -437,6 +437,7 @@ async fn identify_handler(
             "md5_coverage": query.md5_coverage(),
             "naming_convention": format!("{:?}", query.naming_convention),
         },
+        "warnings": parse_warnings,
         "matches": results,
         "processing_info": {
             "detected_format": input_data.format.as_ref().map_or("unknown", super::format_detection::FileFormat::display_name),
@@ -975,11 +976,27 @@ async fn extract_request_data(
     Ok((input_data, config))
 }
 
-/// Parse input data using intelligent format detection
+/// Parse input data using intelligent format detection.
+///
+/// Returns the parsed query header and a list of warnings (e.g. whitespace normalization).
 fn parse_input_data(
     input_data: &InputData,
-) -> Result<crate::core::header::QueryHeader, Box<Response>> {
+) -> Result<(crate::core::header::QueryHeader, Vec<String>), Box<Response>> {
+    let mut warnings: Vec<String> = Vec::new();
+
     if let Some(text_content) = &input_data.text_content {
+        // Normalize space-separated SAM headers before detection and parsing
+        let (normalized_content, was_normalized) =
+            crate::parsing::sam::normalize_sam_whitespace(text_content);
+        if was_normalized {
+            warnings.push(
+                "Input contained spaces instead of tabs between SAM header fields. \
+                 Fields were automatically converted to tab-separated format."
+                    .to_string(),
+            );
+        }
+        let text_content = &normalized_content;
+
         // Text-based parsing with format detection
         let Ok(detected_format) = detect_format(text_content, input_data.filename.as_deref())
         else {
@@ -997,7 +1014,7 @@ fn parse_input_data(
         };
 
         match parse_with_format(text_content, detected_format) {
-            Ok(query) => Ok(query),
+            Ok(query) => Ok((query, warnings)),
             Err(_) => Err(Box::new((
                 StatusCode::BAD_REQUEST,
                 Json(create_safe_error_response(
@@ -1013,7 +1030,7 @@ fn parse_input_data(
         let format = input_data.format.unwrap_or(FileFormat::Bam);
 
         match parse_binary_file(binary_content, format) {
-            Ok(query) => Ok(query),
+            Ok(query) => Ok((query, Vec::new())),
             Err(_) => Err(Box::new((
                 StatusCode::BAD_REQUEST,
                 Json(create_safe_error_response(
diff --git a/src/web/static/css/styles.css b/src/web/static/css/styles.css
@@ -751,6 +751,20 @@ button:disabled {
     border-radius: 4px;
 }
 
+.warning-banner {
+    background: rgba(210, 153, 34, 0.1); /* faint fill at 10% alpha */
+    border: 1px solid var(--warning);
+    border-radius: 6px;
+    padding: 0.75rem 1rem;
+    margin-bottom: 1rem; /* gap before whatever follows the banner */
+    color: var(--warning); /* text uses the same --warning color as the border */
+}
+
+.warning-banner ul {
+    margin: 0.25rem 0 0 1.5rem; /* small top gap, left indent for the list */
+    padding: 0;
+}
+
 .stats {
     display: flex;
     gap: 2rem;
diff --git a/src/web/static/js/managers/ResultsManager.js b/src/web/static/js/managers/ResultsManager.js
@@ -95,12 +95,25 @@ export class ResultsManager {
             return;
         }
 
+        // Display warnings (e.g., whitespace normalization)
+        let warningsHtml = '';
+        if (data.warnings && data.warnings.length > 0) {
+            warningsHtml = `
+                <div class="warning-banner">
+                    <strong>Warning:</strong>
+                    <ul>
+                        ${data.warnings.map(w => `<li>${escapeHtml(w)}</li>`).join('')}
+                    </ul>
+                </div>
+            `;
+        }
+
         // Filter by score threshold
         const filteredMatches = this.currentResults.filter(match =>
             match.score.composite >= config.scoreThreshold
         );
 
-        let html = `
+        let html = warningsHtml + `
             <div class="stats">
                 <div>Contigs: <span>${data.query.contig_count}</span></div>
                 <div>MD5 Coverage: <span>${(data.query.md5_coverage * 100).toFixed(0)}%</span></div>