33//! This module now supports batch processing of multiple samples, either from
44//! direct command-line input or from a TSV file. It automatically determines
55//! sample names and generates separate reports for each, plus a final summary.
6+ //! It uses a dynamic k-mer engine to detect SNPs, MNVs, and Indels.
67
78use crate :: split_kmer;
8- use anyhow:: { anyhow, Context , Result } ;
9+ use anyhow:: { anyhow, Result } ;
910use clap:: Parser ;
1011use log:: { info, warn} ;
1112use rayon:: ThreadPoolBuilder ;
@@ -16,30 +17,74 @@ use std::path::Path;
1617
1718#[ derive( Parser , Debug ) ]
1819pub struct SplitFastqArgs {
19- #[ arg( short = 'i' , long = "input" , group = "input_method" ) ] pub input : Vec < String > ,
20- #[ arg( short = 'l' , long = "input-list" , group = "input_method" ) ] pub input_list : Option < String > ,
21- #[ arg( long) ] pub paired : bool ,
22- #[ arg( short = 'r' , long, required = true ) ] pub reference : String ,
23- #[ arg( short = 'm' , long, required = true ) ] pub markers : String ,
24- #[ arg( short = 't' , long) ] pub threads : Option < usize > ,
25- #[ arg( short = 'o' , long, default_value = "split" ) ] pub output_prefix : String ,
26- #[ arg( long, default_value_t = 10 ) ] pub min_depth : u32 ,
27- #[ arg( long, default_value_t = 95 ) ] pub min_alt_percent : u32 ,
20+ /// One or more FASTQ files to analyze (e.g., -i sample_R1.fq -i sample_R2.fq).
21+ /// Use this or --input-list.
22+ #[ arg( short = 'i' , long = "input" , group = "input_method" ) ]
23+ pub input : Vec < String > ,
24+
25+ /// Path to a TSV file listing samples.
26+ /// Format: sample_name\tpath/to/R1.fq[\tpath/to/R2.fq].
27+ /// Use this or --input.
28+ #[ arg( short = 'l' , long = "input-list" , group = "input_method" ) ]
29+ pub input_list : Option < String > ,
30+
31+ /// If specified, treats input files as paired-end reads, grouped in pairs.
32+ #[ arg( long) ]
33+ pub paired : bool ,
34+
35+ /// Reference FASTA file used to define the markers.
36+ #[ arg( short = 'r' , long, required = true ) ]
37+ pub reference : String ,
38+
39+ /// Path to a TSV file defining markers.
40+ /// Format: position\tREF\tALT\tmarker...
41+ #[ arg( short = 'm' , long, required = true ) ]
42+ pub markers : String ,
43+
44+ /// Number of threads for parallel processing. Defaults to all available cores.
45+ #[ arg( short = 't' , long) ]
46+ pub threads : Option < usize > ,
47+
48+ /// Prefix for the output files.
49+ #[ arg( short = 'o' , long, default_value = "split" ) ]
50+ pub output_prefix : String ,
51+
52+ /// Minimum read depth required to call a variant at a marker position.
53+ #[ arg( long, default_value_t = 10 ) ]
54+ pub min_depth : u32 ,
55+
56+ /// Minimum frequency of the alternate allele to call a variant, as a percentage.
57+ #[ arg( long, default_value_t = 95 ) ]
58+ pub min_alt_percent : u32 ,
2859}
2960
61+ /// Derives a clean sample name from a file path by taking the part before the first delimiter.
3062fn derive_sample_name ( path_str : & str ) -> String {
31- let path = Path :: new ( path_str) ;
32- let file_stem = path. file_stem ( ) . unwrap_or_default ( ) . to_str ( ) . unwrap_or_default ( ) ;
33- file_stem. trim_end_matches ( ".R1" ) . trim_end_matches ( ".R2" ) . trim_end_matches ( "_R1" ) . trim_end_matches ( "_R2" ) . trim_end_matches ( "_1" ) . trim_end_matches ( "_2" ) . to_string ( )
63+ let file_stem = Path :: new ( path_str)
64+ . file_stem ( )
65+ . unwrap_or_default ( )
66+ . to_str ( )
67+ . unwrap_or_default ( ) ;
68+ file_stem
69+ . split ( |c| c == '_' || c == '.' )
70+ . next ( )
71+ . unwrap_or ( file_stem)
72+ . to_string ( )
3473}
3574
75+ /// Reads a TSV file to get a map of sample names to their FASTQ file paths.
3676fn read_sample_list ( path : & str ) -> Result < HashMap < String , Vec < String > > > {
3777 let mut samples = HashMap :: new ( ) ;
3878 let reader = fs:: read_to_string ( path) ?;
3979 for line in reader. lines ( ) {
40- if line. trim ( ) . is_empty ( ) { continue ; }
80+ if line. trim ( ) . is_empty ( ) {
81+ continue ;
82+ }
4183 let fields: Vec < & str > = line. split ( '\t' ) . collect ( ) ;
42- if fields. len ( ) < 2 { warn ! ( "Skipping malformed line in sample list: {}" , line) ; continue ; }
84+ if fields. len ( ) < 2 {
85+ warn ! ( "Skipping malformed line in sample list: {}" , line) ;
86+ continue ;
87+ }
4388 let sample_name = fields[ 0 ] . to_string ( ) ;
4489 let fastq_paths = fields[ 1 ..] . iter ( ) . map ( |s| s. to_string ( ) ) . collect ( ) ;
4590 samples. insert ( sample_name, fastq_paths) ;
@@ -49,74 +94,147 @@ fn read_sample_list(path: &str) -> Result<HashMap<String, Vec<String>>> {
4994
5095pub fn run ( args : SplitFastqArgs ) -> Result < ( ) > {
5196 if args. input . is_empty ( ) && args. input_list . is_none ( ) {
52- return Err ( anyhow ! ( "You must provide an input source: either --input or --input-list." ) ) ;
97+ return Err ( anyhow ! (
98+ "You must provide an input source: either --input or --input-list."
99+ ) ) ;
53100 }
54101
55- rayon:: ThreadPoolBuilder :: new ( ) . num_threads ( args. threads . unwrap_or ( 0 ) ) . build_global ( ) ?;
102+ ThreadPoolBuilder :: new ( )
103+ . num_threads ( args. threads . unwrap_or ( 0 ) )
104+ . build_global ( ) ?;
56105
57- info ! ( "▶ Building split-k-mer table " ) ;
106+ info ! ( "▶ Building dynamic marker database... " ) ;
58107 let markers = split_kmer:: build_markers ( & args. reference , & args. markers ) ?;
59- info ! ( " Loaded {} markers" , markers. len( ) ) ;
108+ info ! ( " Successfully generated {} dynamic markers. " , markers. len( ) ) ;
60109
61110 let mut samples_to_process: HashMap < String , Vec < String > > = HashMap :: new ( ) ;
62111 if let Some ( list_path) = & args. input_list {
63112 samples_to_process = read_sample_list ( list_path) ?;
113+ } else if args. paired {
114+ if args. input . len ( ) % 2 != 0 {
115+ return Err ( anyhow ! (
116+ "--paired requires an even number of input files. Found {}." ,
117+ args. input. len( )
118+ ) ) ;
119+ }
120+ for chunk in args. input . chunks ( 2 ) {
121+ let r1_path = & chunk[ 0 ] ;
122+ let r2_path = & chunk[ 1 ] ;
123+ let sample_name = derive_sample_name ( r1_path) ;
124+ if sample_name != derive_sample_name ( r2_path) {
125+ warn ! (
126+ "Paired files may not belong to the same sample: {} and {}" ,
127+ r1_path, r2_path
128+ ) ;
129+ }
130+ samples_to_process. insert ( sample_name, vec ! [ r1_path. clone( ) , r2_path. clone( ) ] ) ;
131+ }
64132 } else {
65133 for fq_path in & args. input {
66134 let sample_name = derive_sample_name ( fq_path) ;
67- samples_to_process. entry ( sample_name) . or_default ( ) . push ( fq_path. clone ( ) ) ;
135+ samples_to_process
136+ . entry ( sample_name)
137+ . or_default ( )
138+ . push ( fq_path. clone ( ) ) ;
68139 }
69140 }
70141
142+ info ! ( "Found {} sample(s) to process." , samples_to_process. len( ) ) ;
71143 let mut all_summary_lines = Vec :: new ( ) ;
144+
72145 for ( sample_name, fastq_paths) in & samples_to_process {
73146 info ! ( "▶ Processing sample: {}" , sample_name) ;
74147 let counts = split_kmer:: scan_fastq ( fastq_paths, & markers) ?;
75148 info ! ( " Finished scan for {}. Analyzing results..." , sample_name) ;
76149
77- let detailed_output_path = format ! ( "{}_{}_mutations.tsv" , args. output_prefix, sample_name) ;
150+ let detailed_output_path =
151+ format ! ( "{}_{}_mutations.tsv" , args. output_prefix, sample_name) ;
78152 let mut detailed_writer = fs:: File :: create ( & detailed_output_path) ?;
79- writeln ! ( detailed_writer, "pos\t ref\t alt\t A\t C\t G\t T\t lineage" ) ?;
153+
154+ let header = "pos\t ref_allele\t alt_allele\t ref_count\t alt_count\t alt_fraction\t marker\t extra_annotations..." ;
155+ writeln ! ( detailed_writer, "{}" , header) ?;
80156
81157 let mut lineage_counts: HashMap < String , usize > = HashMap :: new ( ) ;
82-
83- for ( marker_id, base_counts) in counts. into_iter ( ) . enumerate ( ) {
84- let marker = & markers[ marker_id] ;
85- let coverage: u32 = base_counts. iter ( ) . sum ( ) ;
86- if coverage < args. min_depth { continue ; }
87- let alt_base_idx = match marker. alt_base {
88- b'A' | b'a' => 0 , b'C' | b'c' => 1 , b'G' | b'g' => 2 , b'T' | b't' => 3 , _ => continue ,
158+ for ( marker_id, & [ ref_count, alt_count] ) in counts. iter ( ) . enumerate ( ) {
159+ let coverage = ref_count + alt_count;
160+ if coverage < args. min_depth {
161+ continue ;
162+ }
163+ let alt_fraction = if coverage > 0 {
164+ ( alt_count as f32 / coverage as f32 ) * 100.0
165+ } else {
166+ 0.0
89167 } ;
90- let alt_count = base_counts[ alt_base_idx] ;
91- if alt_count * 100 < coverage * args. min_alt_percent { continue ; }
92- writeln ! ( detailed_writer, "{}\t {}\t {}\t {}\t {}\t {}\t {}\t {}" , marker. pos + 1 , marker. ref_base as char , marker. alt_base as char , base_counts[ 0 ] , base_counts[ 1 ] , base_counts[ 2 ] , base_counts[ 3 ] , marker. lineage) ?;
93- * lineage_counts. entry ( marker. lineage . clone ( ) ) . or_default ( ) += 1 ;
168+ if alt_fraction < args. min_alt_percent as f32 {
169+ continue ;
170+ }
171+
172+ let marker = & markers[ marker_id] ;
173+
174+ let mut output_line = format ! (
175+ "{}\t {}\t {}\t {}\t {}\t {:.2}\t {}" ,
176+ marker. pos + 1 ,
177+ marker. ref_allele,
178+ marker. alt_allele,
179+ ref_count,
180+ alt_count,
181+ alt_fraction,
182+ marker. lineage
183+ ) ;
184+
185+ if !marker. annotations . is_empty ( ) {
186+ output_line. push ( '\t' ) ;
187+ output_line. push_str ( & marker. annotations . join ( "\t " ) ) ;
188+ }
189+ writeln ! ( detailed_writer, "{}" , output_line) ?;
190+
191+ * lineage_counts
192+ . entry ( marker. lineage . clone ( ) )
193+ . or_default ( ) += 1 ;
94194 }
95- info ! ( " Detailed report for {} written to {}" , sample_name, detailed_output_path) ;
195+ info ! (
196+ " Detailed report for {} written to {}" ,
197+ sample_name, detailed_output_path
198+ ) ;
96199
97200 let summary_line = if lineage_counts. is_empty ( ) {
98201 format ! ( "{}\t \t " , sample_name)
99202 } else {
100203 let mut sorted_lineages: Vec < ( String , usize ) > = lineage_counts. into_iter ( ) . collect ( ) ;
101204 sorted_lineages. sort_by ( |a, b| b. 1 . cmp ( & a. 1 ) ) ;
102- let list_str = sorted_lineages. iter ( ) . map ( |( l, c) | format ! ( "{}:{}" , l, c) ) . collect :: < Vec < _ > > ( ) . join ( "," ) ;
103- let major_lineage = if sorted_lineages. len ( ) == 1 || sorted_lineages[ 0 ] . 1 > sorted_lineages[ 1 ] . 1 {
104- sorted_lineages[ 0 ] . 0 . clone ( )
105- } else {
106- let top_count = sorted_lineages[ 0 ] . 1 ;
107- sorted_lineages. iter ( ) . filter ( |( _, c) | * c == top_count) . map ( |( l, _) | l. clone ( ) ) . collect :: < Vec < _ > > ( ) . join ( "," )
108- } ;
205+ let list_str = sorted_lineages
206+ . iter ( )
207+ . map ( |( l, c) | format ! ( "{}:{}" , l, c) )
208+ . collect :: < Vec < _ > > ( )
209+ . join ( "," ) ;
210+ let major_lineage =
211+ if sorted_lineages. len ( ) == 1 || sorted_lineages[ 0 ] . 1 > sorted_lineages[ 1 ] . 1 {
212+ sorted_lineages[ 0 ] . 0 . clone ( )
213+ } else {
214+ let top_count = sorted_lineages[ 0 ] . 1 ;
215+ sorted_lineages
216+ . iter ( )
217+ . filter ( |( _, c) | * c == top_count)
218+ . map ( |( l, _) | l. clone ( ) )
219+ . collect :: < Vec < _ > > ( )
220+ . join ( "," )
221+ } ;
109222 format ! ( "{}\t {}\t {}" , sample_name, list_str, major_lineage)
110223 } ;
111224 all_summary_lines. push ( summary_line) ;
112225 }
113226
114227 let summary_path = format ! ( "{}_summary.tsv" , args. output_prefix) ;
115- info ! ( "▶ Writing final summary for all samples to {}" , summary_path) ;
228+ info ! (
229+ "▶ Writing final summary for all samples to {}" ,
230+ summary_path
231+ ) ;
116232 let mut summary_writer = fs:: File :: create ( & summary_path) ?;
117233 writeln ! ( summary_writer, "genome\t lineage:count\t major_lineage" ) ?;
118- for line in all_summary_lines { writeln ! ( summary_writer, "{}" , line) ?; }
119-
234+ for line in all_summary_lines {
235+ writeln ! ( summary_writer, "{}" , line) ?;
236+ }
237+
120238 info ! ( "✅ Process completed." ) ;
121239 Ok ( ( ) )
122240}
0 commit comments