Skip to content

Commit 37d1db7

Browse files
authored
feat: normalize space-separated SAM headers in web UI (#13)
* feat: normalize space-separated SAM header lines to tabs
* feat: detect space-separated SAM headers in format detection
* feat: propagate whitespace normalization warning in API response
* feat: display whitespace normalization warning in web UI
* refactor: use Cow to avoid allocation, extract is_sam_record helper, move warning styles to CSS
1 parent fea7cbb commit 37d1db7

5 files changed

Lines changed: 247 additions & 10 deletions

File tree

src/parsing/sam.rs

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use std::borrow::Cow;
12
use std::io::BufReader;
23
use std::path::Path;
34
use thiserror::Error;
@@ -182,6 +183,83 @@ fn header_to_query(
182183
Ok(query)
183184
}
184185

186+
/// Normalize SAM header lines that use spaces instead of tabs.
///
/// Browsers and copy-paste often convert tabs to spaces. This function detects
/// SAM header lines (`@HD`, `@SQ`, `@RG`, `@PG`) whose fields are separated by
/// spaces and rewrites the separators as tabs.
///
/// Only whitespace that precedes a `TAG:` token (and the whitespace directly
/// after the record type) is treated as a field separator. Other tokens are
/// kept attached to the preceding field with a single space, so values that
/// legitimately contain spaces (for example `SP:Homo sapiens`) are not split
/// into bogus fields. Runs of whitespace collapse to one separator and trailing
/// whitespace on a rewritten line is dropped.
///
/// Lines that already contain a tab are left unchanged. `@CO` (comment) lines
/// are not normalized because their content is free-form text where spaces are
/// meaningful. Every other line, including blank lines, and every line
/// terminator (`\n` or `\r\n`, or none on the final line) is copied through
/// untouched.
///
/// Returns the (possibly normalized) text and a boolean indicating whether any
/// normalization was performed. Uses `Cow` to avoid allocation when no
/// normalization is needed (the common case for well-formed input).
#[must_use]
pub fn normalize_sam_whitespace(text: &str) -> (Cow<'_, str>, bool) {
    // Fast path: check if any line needs normalization before allocating.
    if !text.lines().any(needs_space_to_tab_normalization) {
        return (Cow::Borrowed(text), false);
    }

    // The rewritten text is never longer than the input: each separator run
    // is replaced by exactly one byte.
    let mut normalized = String::with_capacity(text.len());

    // `split_inclusive` hands back each line together with its own terminator,
    // so leading blank lines, CRLF endings and the presence or absence of a
    // final newline all survive without special-casing.
    for raw_line in text.split_inclusive('\n') {
        let (line, terminator) = split_line_terminator(raw_line);

        if needs_space_to_tab_normalization(line) {
            push_tab_separated_fields(&mut normalized, line);
        } else {
            normalized.push_str(line);
        }
        normalized.push_str(terminator);
    }

    (Cow::Owned(normalized), true)
}

/// Split one `split_inclusive('\n')` chunk into its content and its
/// terminator (`"\r\n"`, `"\n"`, or `""` for an unterminated final line).
fn split_line_terminator(raw_line: &str) -> (&str, &str) {
    let content = raw_line
        .strip_suffix("\r\n")
        .or_else(|| raw_line.strip_suffix('\n'))
        .unwrap_or(raw_line);
    raw_line.split_at(content.len())
}

/// Append `line` to `out` with its field separators rewritten as tabs.
///
/// The first token is the record type. The second token always starts a new
/// field; later tokens start a new field only when they look like `TAG:`,
/// otherwise they are treated as a continuation of the previous value.
fn push_tab_separated_fields(out: &mut String, line: &str) {
    for (index, token) in line.split_whitespace().enumerate() {
        match index {
            0 => {}
            1 => out.push('\t'),
            _ if is_sam_tag_field(token) => out.push('\t'),
            _ => out.push(' '),
        }
        out.push_str(token);
    }
}

/// Check whether `field` starts with a two-character SAM header tag and a
/// colon, i.e. matches `[A-Za-z][A-Za-z0-9]:`.
///
/// Digits are allowed in the second position (`M5`) and lowercase letters are
/// allowed because the SAM specification reserves them for end-user tags.
fn is_sam_tag_field(field: &str) -> bool {
    matches!(
        field.as_bytes(),
        [first, second, b':', ..]
            if first.is_ascii_alphabetic() && second.is_ascii_alphanumeric()
    )
}

/// Check if a line is a SAM header line that uses spaces instead of tabs.
///
/// Returns true only when the line contains NO tab characters, starts with a
/// recognized SAM record type (`@HD`, `@SQ`, `@RG`, `@PG`) followed by a
/// space, and at least one later whitespace-separated token looks like
/// `TAG:`.
fn needs_space_to_tab_normalization(line: &str) -> bool {
    // `@CO` is deliberately absent: comment lines are free-form text.
    const RECORD_PREFIXES: [&str; 4] = ["@HD ", "@SQ ", "@RG ", "@PG "];

    if line.contains('\t') {
        return false;
    }

    if !RECORD_PREFIXES.iter().any(|prefix| line.starts_with(prefix)) {
        return false;
    }

    // Must have at least one TAG:VALUE pattern after the record type.
    line.split_whitespace().skip(1).any(is_sam_tag_field)
}
262+
185263
/// Parse header from raw text (stdin or pasted)
186264
///
187265
/// # Errors
@@ -190,6 +268,8 @@ fn header_to_query(
190268
/// required fields, or no contigs are found, or `ParseError::TooManyContigs`
191269
/// if the limit is exceeded.
192270
pub fn parse_header_text(text: &str) -> Result<QueryHeader, ParseError> {
271+
let (normalized_text, _) = normalize_sam_whitespace(text);
272+
let text = &normalized_text;
193273
let mut contigs = Vec::new();
194274

195275
for line in text.lines() {
@@ -331,4 +411,86 @@ mod tests {
331411
]
332412
);
333413
}
414+
415+
/// A space-separated `@SQ` line comes back tab-separated, with the trailing
/// newline intact and the "was normalized" flag set.
#[test]
fn test_normalize_sam_whitespace_spaces_to_tabs() {
    let spaced = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
    let expected = "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n";

    let (output, changed) = normalize_sam_whitespace(spaced);

    assert!(changed);
    assert_eq!(output, expected);
}
425+
426+
/// Input that is already tab-separated is returned unchanged and is not
/// reported as normalized.
#[test]
fn test_normalize_sam_whitespace_already_tabs() {
    let tabbed = "@SQ\tSN:chr1\tLN:248956422\n";

    let (output, changed) = normalize_sam_whitespace(tabbed);

    assert!(!changed);
    assert_eq!(output, tabbed);
}
433+
434+
/// Several space-separated header lines of different record types are all
/// rewritten in a single call.
#[test]
fn test_normalize_sam_whitespace_mixed_lines() {
    let spaced =
        "@HD VN:1.6 SO:coordinate\n@SQ SN:chr1 LN:248956422\n@SQ SN:chr2 LN:242193529\n";

    let (output, changed) = normalize_sam_whitespace(spaced);
    assert!(changed);

    // Every line must show up in its tab-separated form.
    let expected_lines = [
        "@HD\tVN:1.6\tSO:coordinate",
        "@SQ\tSN:chr1\tLN:248956422",
        "@SQ\tSN:chr2\tLN:242193529",
    ];
    for expected in expected_lines.iter() {
        assert!(output.contains(expected));
    }
}
444+
445+
/// Runs of several spaces between fields collapse to exactly one tab each.
#[test]
fn test_normalize_sam_whitespace_multiple_spaces() {
    // The separators here are deliberately wider than one space; with single
    // spaces this test would only repeat the basic spaces-to-tabs case.
    let input = "@SQ   SN:chr1    LN:248956422\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(was_normalized);
    assert_eq!(normalized, "@SQ\tSN:chr1\tLN:248956422\n");
}
452+
453+
/// Lines that are not SAM header records pass through verbatim even when a
/// neighbouring header line is rewritten.
#[test]
fn test_normalize_sam_whitespace_preserves_non_header_lines() {
    let mixed = "some random text with spaces\n@SQ SN:chr1 LN:100\n";

    let (output, changed) = normalize_sam_whitespace(mixed);

    assert!(changed);
    let untouched_first_line = output.starts_with("some random text with spaces\n");
    assert!(untouched_first_line);
    let rewritten_header = output.contains("@SQ\tSN:chr1\tLN:100");
    assert!(rewritten_header);
}
461+
462+
/// A header line that mixes tabs and spaces is left exactly as it was: the
/// presence of any tab disables normalization for that line.
#[test]
fn test_normalize_sam_whitespace_tabs_and_spaces_mixed() {
    let partly_tabbed = "@SQ\tSN:chr1 LN:248956422\n";

    let (output, changed) = normalize_sam_whitespace(partly_tabbed);

    assert!(!changed);
    assert_eq!(output, partly_tabbed);
}
470+
471+
/// `@CO` comment lines keep their spaces, even when they mention something
/// that looks like a `TAG:VALUE` pair, while real header lines are rewritten.
#[test]
fn test_normalize_sam_whitespace_skips_comment_lines() {
    let with_comment = "@CO This is a comment with VN:1.0 mentioned\n@SQ SN:chr1 LN:100\n";

    let (output, changed) = normalize_sam_whitespace(with_comment);
    assert!(changed);

    // The comment must survive byte-for-byte at the start of the output.
    let comment_kept = output.starts_with("@CO This is a comment with VN:1.0 mentioned\n");
    assert!(comment_kept);
    let header_rewritten = output.contains("@SQ\tSN:chr1\tLN:100");
    assert!(header_rewritten);
}
480+
481+
/// End-to-end check: `parse_header_text` accepts space-separated `@SQ` lines
/// and yields the same contig names, lengths and MD5 as tab-separated input
/// would.
#[test]
fn test_parse_header_text_with_spaces() {
    let spaced_header =
        "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n@SQ SN:chr2 LN:242193529\n";

    let parsed = parse_header_text(spaced_header).unwrap();
    assert_eq!(parsed.contigs.len(), 2);

    let first = &parsed.contigs[0];
    assert_eq!(first.name, "chr1");
    assert_eq!(first.length, 248_956_422);
    assert_eq!(
        first.md5,
        Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
    );

    let second = &parsed.contigs[1];
    assert_eq!(second.name, "chr2");
    assert_eq!(second.length, 242_193_529);
}
334496
}

src/web/format_detection.rs

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,17 @@ fn detect_format_from_filename(filename: &str) -> Option<FileFormat> {
135135
}
136136
}
137137

138+
/// Report whether `line` opens with the SAM record type `prefix` and the byte
/// immediately after it is a field delimiter.
///
/// Tab is the delimiter the SAM format uses; a single space is accepted as
/// well so that copy-pasted headers whose tabs became spaces still match.
fn is_sam_record(line: &str, prefix: &str) -> bool {
    let Some(remainder) = line.strip_prefix(prefix) else {
        return false;
    };
    matches!(remainder.as_bytes().first(), Some(b'\t' | b' '))
}
148+
138149
/// Detect format from file content analysis
139150
fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError> {
140151
let content_trimmed = content.trim();
@@ -155,14 +166,14 @@ fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError>
155166
let lines: Vec<&str> = content_trimmed.lines().take(20).collect(); // Sample first 20 lines
156167

157168
// Picard dictionary: starts with @HD and has @SQ lines (check BEFORE Sam)
158-
if lines.iter().any(|line| line.starts_with("@HD\t"))
159-
&& lines.iter().any(|line| line.starts_with("@SQ\t"))
169+
if lines.iter().any(|line| is_sam_record(line, "@HD"))
170+
&& lines.iter().any(|line| is_sam_record(line, "@SQ"))
160171
{
161172
return Ok(FileFormat::Dict);
162173
}
163174

164175
// SAM header format: starts with @SQ lines
165-
if lines.iter().any(|line| line.starts_with("@SQ\t")) {
176+
if lines.iter().any(|line| is_sam_record(line, "@SQ")) {
166177
return Ok(FileFormat::Sam);
167178
}
168179

@@ -555,6 +566,26 @@ mod tests {
555566
assert!(!validate_format_content("@SQ\tSN:chr1", &FileFormat::Vcf));
556567
}
557568

569+
/// A lone space-separated `@SQ` line is still detected as SAM content.
#[test]
fn test_sam_header_detection_with_spaces() {
    let spaced_sq = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";

    let detected = detect_format_from_content(spaced_sq);

    assert_eq!(detected, Ok(FileFormat::Sam));
}
574+
575+
/// Space-separated `@HD` plus `@SQ` lines are detected as a sequence
/// dictionary, the same as their tab-separated form.
#[test]
fn test_dict_detection_with_spaces() {
    let spaced_dict = "@HD VN:1.0 SO:coordinate\n@SQ SN:chr1 LN:248956422\n";

    let detected = detect_format_from_content(spaced_dict);

    assert_eq!(detected, Ok(FileFormat::Dict));
}
580+
581+
/// Content validation accepts a space-separated `@SQ` line as SAM.
#[test]
fn test_sam_validation_with_spaces() {
    let spaced_sq = "@SQ SN:chr1 LN:123";

    let accepted = validate_format_content(spaced_sq, &FileFormat::Sam);

    assert!(accepted);
}
588+
558589
#[test]
559590
fn test_combined_detection() {
560591
let content = "@SQ\tSN:chr1\tLN:248956422\n";

src/web/server.rs

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,8 @@ async fn identify_handler(
327327
};
328328

329329
// Parse input using intelligent format detection
330-
let query = match parse_input_data(&input_data) {
331-
Ok(query) => query,
330+
let (query, parse_warnings) = match parse_input_data(&input_data) {
331+
Ok(result) => result,
332332
Err(error_response) => return *error_response,
333333
};
334334

@@ -437,6 +437,7 @@ async fn identify_handler(
437437
"md5_coverage": query.md5_coverage(),
438438
"naming_convention": format!("{:?}", query.naming_convention),
439439
},
440+
"warnings": parse_warnings,
440441
"matches": results,
441442
"processing_info": {
442443
"detected_format": input_data.format.as_ref().map_or("unknown", super::format_detection::FileFormat::display_name),
@@ -975,11 +976,27 @@ async fn extract_request_data(
975976
Ok((input_data, config))
976977
}
977978

978-
/// Parse input data using intelligent format detection
979+
/// Parse input data using intelligent format detection.
980+
///
981+
/// Returns the parsed query header and a list of warnings (e.g. whitespace normalization).
979982
fn parse_input_data(
980983
input_data: &InputData,
981-
) -> Result<crate::core::header::QueryHeader, Box<Response>> {
984+
) -> Result<(crate::core::header::QueryHeader, Vec<String>), Box<Response>> {
985+
let mut warnings: Vec<String> = Vec::new();
986+
982987
if let Some(text_content) = &input_data.text_content {
988+
// Normalize space-separated SAM headers before detection and parsing
989+
let (normalized_content, was_normalized) =
990+
crate::parsing::sam::normalize_sam_whitespace(text_content);
991+
if was_normalized {
992+
warnings.push(
993+
"Input contained spaces instead of tabs between SAM header fields. \
994+
Fields were automatically converted to tab-separated format."
995+
.to_string(),
996+
);
997+
}
998+
let text_content = &normalized_content;
999+
9831000
// Text-based parsing with format detection
9841001
let Ok(detected_format) = detect_format(text_content, input_data.filename.as_deref())
9851002
else {
@@ -997,7 +1014,7 @@ fn parse_input_data(
9971014
};
9981015

9991016
match parse_with_format(text_content, detected_format) {
1000-
Ok(query) => Ok(query),
1017+
Ok(query) => Ok((query, warnings)),
10011018
Err(_) => Err(Box::new((
10021019
StatusCode::BAD_REQUEST,
10031020
Json(create_safe_error_response(
@@ -1013,7 +1030,7 @@ fn parse_input_data(
10131030
let format = input_data.format.unwrap_or(FileFormat::Bam);
10141031

10151032
match parse_binary_file(binary_content, format) {
1016-
Ok(query) => Ok(query),
1033+
Ok(query) => Ok((query, Vec::new())),
10171034
Err(_) => Err(Box::new((
10181035
StatusCode::BAD_REQUEST,
10191036
Json(create_safe_error_response(

src/web/static/css/styles.css

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,20 @@ button:disabled {
751751
border-radius: 4px;
752752
}
753753

754+
.warning-banner {
755+
background: rgba(210, 153, 34, 0.1);
756+
border: 1px solid var(--warning);
757+
border-radius: 6px;
758+
padding: 0.75rem 1rem;
759+
margin-bottom: 1rem;
760+
color: var(--warning);
761+
}
762+
763+
.warning-banner ul {
764+
margin: 0.25rem 0 0 1.5rem;
765+
padding: 0;
766+
}
767+
754768
.stats {
755769
display: flex;
756770
gap: 2rem;

src/web/static/js/managers/ResultsManager.js

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,25 @@ export class ResultsManager {
9595
return;
9696
}
9797

98+
// Display warnings (e.g., whitespace normalization)
99+
let warningsHtml = '';
100+
if (data.warnings && data.warnings.length > 0) {
101+
warningsHtml = `
102+
<div class="warning-banner">
103+
<strong>Warning:</strong>
104+
<ul>
105+
${data.warnings.map(w => `<li>${escapeHtml(w)}</li>`).join('')}
106+
</ul>
107+
</div>
108+
`;
109+
}
110+
98111
// Filter by score threshold
99112
const filteredMatches = this.currentResults.filter(match =>
100113
match.score.composite >= config.scoreThreshold
101114
);
102115

103-
let html = `
116+
let html = warningsHtml + `
104117
<div class="stats">
105118
<div>Contigs: <span>${data.query.contig_count}</span></div>
106119
<div>MD5 Coverage: <span>${(data.query.md5_coverage * 100).toFixed(0)}%</span></div>

0 commit comments

Comments
 (0)