1+ use std:: borrow:: Cow ;
12use std:: io:: BufReader ;
23use std:: path:: Path ;
34use thiserror:: Error ;
@@ -182,6 +183,83 @@ fn header_to_query(
182183 Ok ( query)
183184}
184185
186+ /// Normalize SAM header lines that use spaces instead of tabs.
187+ ///
188+ /// Browsers and copy-paste often convert tabs to spaces. This function detects
189+ /// SAM header lines (`@XX` prefix) where fields are space-separated instead of
190+ /// tab-separated and converts the spaces to tabs.
191+ ///
192+ /// Only normalizes lines that start with a SAM header record type (`@HD`, `@SQ`,
193+ /// `@RG`, `@PG`) followed by a space and a TAG: pattern. Lines that already
194+ /// contain tabs are left unchanged. `@CO` (comment) lines are not normalized
195+ /// because their content is free-form text where spaces are meaningful.
196+ ///
197+ /// Returns the (possibly normalized) text and a boolean indicating whether any
198+ /// normalization was performed. Uses `Cow` to avoid allocation when no
199+ /// normalization is needed (the common case for well-formed input).
200+ #[ must_use]
201+ pub fn normalize_sam_whitespace ( text : & str ) -> ( Cow < ' _ , str > , bool ) {
202+ // Fast path: check if any line needs normalization before allocating
203+ if !text. lines ( ) . any ( needs_space_to_tab_normalization) {
204+ return ( Cow :: Borrowed ( text) , false ) ;
205+ }
206+
207+ let mut normalized = String :: with_capacity ( text. len ( ) ) ;
208+
209+ for line in text. lines ( ) {
210+ if !normalized. is_empty ( ) {
211+ normalized. push ( '\n' ) ;
212+ }
213+
214+ if needs_space_to_tab_normalization ( line) {
215+ let mut first = true ;
216+ for field in line. split_whitespace ( ) {
217+ if first {
218+ normalized. push_str ( field) ;
219+ first = false ;
220+ } else {
221+ normalized. push ( '\t' ) ;
222+ normalized. push_str ( field) ;
223+ }
224+ }
225+ } else {
226+ normalized. push_str ( line) ;
227+ }
228+ }
229+
230+ // Preserve trailing newline if present
231+ if text. ends_with ( '\n' ) {
232+ normalized. push ( '\n' ) ;
233+ }
234+
235+ ( Cow :: Owned ( normalized) , true )
236+ }
237+
/// Decide whether a single line is a space-separated SAM header record.
///
/// The answer is `true` only when all of the following hold:
///
/// * the line contains no tab character at all,
/// * it begins with `@HD`, `@SQ`, `@RG` or `@PG` immediately followed by a space
///   (`@CO` comment lines are deliberately excluded — they are free-form text),
/// * at least one whitespace-separated field after the record type begins with
///   two ASCII uppercase letters followed by `:` (a `TAG:` prefix).
fn needs_space_to_tab_normalization(line: &str) -> bool {
    // Record types whose fields are structured; `@CO` is intentionally absent.
    const RECORD_PREFIXES: [&str; 4] = ["@HD ", "@SQ ", "@RG ", "@PG "];

    let already_has_tab = line.contains('\t');
    let is_structured_record = RECORD_PREFIXES
        .iter()
        .any(|prefix| line.starts_with(prefix));

    if already_has_tab || !is_structured_record {
        return false;
    }

    // Skip the record type itself, then look for any field shaped like `XX:...`.
    line.split_whitespace().skip(1).any(|field| {
        matches!(
            field.as_bytes(),
            [first, second, b':', ..]
                if first.is_ascii_uppercase() && second.is_ascii_uppercase()
        )
    })
}
262+
185263/// Parse header from raw text (stdin or pasted)
186264///
187265/// # Errors
@@ -190,6 +268,8 @@ fn header_to_query(
190268/// required fields, or no contigs are found, or `ParseError::TooManyContigs`
191269/// if the limit is exceeded.
192270pub fn parse_header_text ( text : & str ) -> Result < QueryHeader , ParseError > {
271+ let ( normalized_text, _) = normalize_sam_whitespace ( text) ;
272+ let text = & normalized_text;
193273 let mut contigs = Vec :: new ( ) ;
194274
195275 for line in text. lines ( ) {
@@ -331,4 +411,86 @@ mod tests {
331411 ]
332412 ) ;
333413 }
414+
#[test]
fn test_normalize_sam_whitespace_spaces_to_tabs() {
    // A fully space-separated @SQ line: every separator must become one tab and
    // the trailing newline must survive.
    let input = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(was_normalized);
    assert_eq!(
        normalized,
        "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n"
    );
}
425+
#[test]
fn test_normalize_sam_whitespace_already_tabs() {
    // Well-formed, tab-separated input must come back untouched.
    let input = "@SQ\tSN:chr1\tLN:248956422\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(!was_normalized);
    assert_eq!(normalized, input);
    // The no-op path is documented as allocation-free.
    assert!(matches!(normalized, std::borrow::Cow::Borrowed(_)));
}
433+
#[test]
fn test_normalize_sam_whitespace_mixed_lines() {
    // Several record types in one header: each line is normalized independently.
    let input =
        "@HD VN:1.6 SO:coordinate\n@SQ SN:chr1 LN:248956422\n@SQ SN:chr2 LN:242193529\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(was_normalized);
    // Exact comparison also pins line order and line terminators.
    assert_eq!(
        normalized,
        "@HD\tVN:1.6\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\n@SQ\tSN:chr2\tLN:242193529\n"
    );
}
444+
#[test]
fn test_normalize_sam_whitespace_multiple_spaces() {
    // Runs of several spaces between fields must collapse to a single tab each.
    let input = "@SQ  SN:chr1   LN:248956422\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(was_normalized);
    assert_eq!(normalized, "@SQ\tSN:chr1\tLN:248956422\n");
}
452+
#[test]
fn test_normalize_sam_whitespace_preserves_non_header_lines() {
    // Lines that are not SAM header records keep their spaces verbatim, even
    // when another line in the same text is rewritten.
    let input = "some random text with spaces\n@SQ SN:chr1 LN:100\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(was_normalized);
    assert_eq!(
        normalized,
        "some random text with spaces\n@SQ\tSN:chr1\tLN:100\n"
    );
}
461+
#[test]
fn test_normalize_sam_whitespace_tabs_and_spaces_mixed() {
    // Line has some tabs and some spaces — leave it alone: once a tab is
    // present, the remaining spaces may be part of a field value.
    let input = "@SQ\tSN:chr1 LN:248956422\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(!was_normalized);
    assert_eq!(normalized, input);
    assert!(matches!(normalized, std::borrow::Cow::Borrowed(_)));
}
470+
#[test]
fn test_normalize_sam_whitespace_skips_comment_lines() {
    // The @CO line contains something that looks like a TAG:VALUE pair, but
    // comments are free-form text and must be preserved as-is.
    let input = "@CO This is a comment with VN:1.0 mentioned\n@SQ SN:chr1 LN:100\n";
    let (normalized, was_normalized) = normalize_sam_whitespace(input);
    assert!(was_normalized);
    assert_eq!(
        normalized,
        "@CO This is a comment with VN:1.0 mentioned\n@SQ\tSN:chr1\tLN:100\n"
    );
}
480+
#[test]
fn test_parse_header_text_with_spaces() {
    // End-to-end check: a pasted header whose tabs were turned into spaces must
    // still parse into the expected contigs.
    let header = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n\
                  @SQ SN:chr2 LN:242193529\n";
    let query = parse_header_text(header).unwrap();
    assert_eq!(query.contigs.len(), 2);
    assert_eq!(query.contigs[0].name, "chr1");
    assert_eq!(query.contigs[0].length, 248_956_422);
    assert_eq!(
        query.contigs[0].md5,
        Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
    );
    assert_eq!(query.contigs[1].name, "chr2");
    assert_eq!(query.contigs[1].length, 242_193_529);
}
334496}
0 commit comments