Skip to content

Commit 5860832

Browse files
authored
Update classify_split_fastq.rs to indels and MNV
1 parent 197394f commit 5860832

1 file changed

Lines changed: 162 additions & 44 deletions

File tree

src/classify_split_fastq.rs

Lines changed: 162 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
//! This module now supports batch processing of multiple samples, either from
44
//! direct command-line input or from a TSV file. It automatically determines
55
//! sample names and generates separate reports for each, plus a final summary.
6+
//! It uses a dynamic k-mer engine to detect SNPs, MNVs, and Indels.
67
78
use crate::split_kmer;
8-
use anyhow::{anyhow, Context, Result};
9+
use anyhow::{anyhow, Result};
910
use clap::Parser;
1011
use log::{info, warn};
1112
use rayon::ThreadPoolBuilder;
@@ -16,30 +17,74 @@ use std::path::Path;
1617

1718
#[derive(Parser, Debug)]
1819
pub struct SplitFastqArgs {
19-
#[arg(short = 'i', long = "input", group = "input_method")] pub input: Vec<String>,
20-
#[arg(short = 'l', long = "input-list", group = "input_method")] pub input_list: Option<String>,
21-
#[arg(long)] pub paired: bool,
22-
#[arg(short = 'r', long, required = true)] pub reference: String,
23-
#[arg(short = 'm', long, required = true)] pub markers: String,
24-
#[arg(short = 't', long)] pub threads: Option<usize>,
25-
#[arg(short = 'o', long, default_value = "split")] pub output_prefix: String,
26-
#[arg(long, default_value_t = 10)] pub min_depth: u32,
27-
#[arg(long, default_value_t = 95)] pub min_alt_percent: u32,
20+
/// One or more FASTQ files to analyze (e.g., -i sample_R1.fq -i sample_R2.fq).
21+
/// Use this or --input-list.
22+
#[arg(short = 'i', long = "input", group = "input_method")]
23+
pub input: Vec<String>,
24+
25+
/// Path to a TSV file listing samples.
26+
/// Format: sample_name\tpath/to/R1.fq[\tpath/to/R2.fq].
27+
/// Use this or --input.
28+
#[arg(short = 'l', long = "input-list", group = "input_method")]
29+
pub input_list: Option<String>,
30+
31+
/// If specified, treats input files as paired-end reads, grouped in pairs.
32+
#[arg(long)]
33+
pub paired: bool,
34+
35+
/// Reference FASTA file used to define the markers.
36+
#[arg(short = 'r', long, required = true)]
37+
pub reference: String,
38+
39+
/// Path to a TSV file defining markers.
40+
/// Format: position\tREF\tALT\tmarker...
41+
#[arg(short = 'm', long, required = true)]
42+
pub markers: String,
43+
44+
/// Number of threads for parallel processing. Defaults to all available cores.
45+
#[arg(short = 't', long)]
46+
pub threads: Option<usize>,
47+
48+
/// Prefix for the output files.
49+
#[arg(short = 'o', long, default_value = "split")]
50+
pub output_prefix: String,
51+
52+
/// Minimum read depth required to call a variant at a marker position.
53+
#[arg(long, default_value_t = 10)]
54+
pub min_depth: u32,
55+
56+
/// Minimum frequency of the alternate allele to call a variant, as a percentage.
57+
#[arg(long, default_value_t = 95)]
58+
pub min_alt_percent: u32,
2859
}
2960

61+
/// Derives a clean sample name from a file path by taking the part before the first delimiter.
3062
fn derive_sample_name(path_str: &str) -> String {
31-
let path = Path::new(path_str);
32-
let file_stem = path.file_stem().unwrap_or_default().to_str().unwrap_or_default();
33-
file_stem.trim_end_matches(".R1").trim_end_matches(".R2").trim_end_matches("_R1").trim_end_matches("_R2").trim_end_matches("_1").trim_end_matches("_2").to_string()
63+
let file_stem = Path::new(path_str)
64+
.file_stem()
65+
.unwrap_or_default()
66+
.to_str()
67+
.unwrap_or_default();
68+
file_stem
69+
.split(|c| c == '_' || c == '.')
70+
.next()
71+
.unwrap_or(file_stem)
72+
.to_string()
3473
}
3574

75+
/// Reads a TSV file to get a map of sample names to their FASTQ file paths.
3676
fn read_sample_list(path: &str) -> Result<HashMap<String, Vec<String>>> {
3777
let mut samples = HashMap::new();
3878
let reader = fs::read_to_string(path)?;
3979
for line in reader.lines() {
40-
if line.trim().is_empty() { continue; }
80+
if line.trim().is_empty() {
81+
continue;
82+
}
4183
let fields: Vec<&str> = line.split('\t').collect();
42-
if fields.len() < 2 { warn!("Skipping malformed line in sample list: {}", line); continue; }
84+
if fields.len() < 2 {
85+
warn!("Skipping malformed line in sample list: {}", line);
86+
continue;
87+
}
4388
let sample_name = fields[0].to_string();
4489
let fastq_paths = fields[1..].iter().map(|s| s.to_string()).collect();
4590
samples.insert(sample_name, fastq_paths);
@@ -49,74 +94,147 @@ fn read_sample_list(path: &str) -> Result<HashMap<String, Vec<String>>> {
4994

5095
pub fn run(args: SplitFastqArgs) -> Result<()> {
5196
if args.input.is_empty() && args.input_list.is_none() {
52-
return Err(anyhow!("You must provide an input source: either --input or --input-list."));
97+
return Err(anyhow!(
98+
"You must provide an input source: either --input or --input-list."
99+
));
53100
}
54101

55-
rayon::ThreadPoolBuilder::new().num_threads(args.threads.unwrap_or(0)).build_global()?;
102+
ThreadPoolBuilder::new()
103+
.num_threads(args.threads.unwrap_or(0))
104+
.build_global()?;
56105

57-
info!("▶ Building split-k-mer table");
106+
info!("▶ Building dynamic marker database...");
58107
let markers = split_kmer::build_markers(&args.reference, &args.markers)?;
59-
info!(" Loaded {} markers", markers.len());
108+
info!(" Successfully generated {} dynamic markers.", markers.len());
60109

61110
let mut samples_to_process: HashMap<String, Vec<String>> = HashMap::new();
62111
if let Some(list_path) = &args.input_list {
63112
samples_to_process = read_sample_list(list_path)?;
113+
} else if args.paired {
114+
if args.input.len() % 2 != 0 {
115+
return Err(anyhow!(
116+
"--paired requires an even number of input files. Found {}.",
117+
args.input.len()
118+
));
119+
}
120+
for chunk in args.input.chunks(2) {
121+
let r1_path = &chunk[0];
122+
let r2_path = &chunk[1];
123+
let sample_name = derive_sample_name(r1_path);
124+
if sample_name != derive_sample_name(r2_path) {
125+
warn!(
126+
"Paired files may not belong to the same sample: {} and {}",
127+
r1_path, r2_path
128+
);
129+
}
130+
samples_to_process.insert(sample_name, vec![r1_path.clone(), r2_path.clone()]);
131+
}
64132
} else {
65133
for fq_path in &args.input {
66134
let sample_name = derive_sample_name(fq_path);
67-
samples_to_process.entry(sample_name).or_default().push(fq_path.clone());
135+
samples_to_process
136+
.entry(sample_name)
137+
.or_default()
138+
.push(fq_path.clone());
68139
}
69140
}
70141

142+
info!("Found {} sample(s) to process.", samples_to_process.len());
71143
let mut all_summary_lines = Vec::new();
144+
72145
for (sample_name, fastq_paths) in &samples_to_process {
73146
info!("▶ Processing sample: {}", sample_name);
74147
let counts = split_kmer::scan_fastq(fastq_paths, &markers)?;
75148
info!(" Finished scan for {}. Analyzing results...", sample_name);
76149

77-
let detailed_output_path = format!("{}_{}_mutations.tsv", args.output_prefix, sample_name);
150+
let detailed_output_path =
151+
format!("{}_{}_mutations.tsv", args.output_prefix, sample_name);
78152
let mut detailed_writer = fs::File::create(&detailed_output_path)?;
79-
writeln!(detailed_writer, "pos\tref\talt\tA\tC\tG\tT\tlineage")?;
153+
154+
let header = "pos\tref_allele\talt_allele\tref_count\talt_count\talt_fraction\tmarker\textra_annotations...";
155+
writeln!(detailed_writer, "{}", header)?;
80156

81157
let mut lineage_counts: HashMap<String, usize> = HashMap::new();
82-
83-
for (marker_id, base_counts) in counts.into_iter().enumerate() {
84-
let marker = &markers[marker_id];
85-
let coverage: u32 = base_counts.iter().sum();
86-
if coverage < args.min_depth { continue; }
87-
let alt_base_idx = match marker.alt_base {
88-
b'A' | b'a' => 0, b'C' | b'c' => 1, b'G' | b'g' => 2, b'T' | b't' => 3, _ => continue,
158+
for (marker_id, &[ref_count, alt_count]) in counts.iter().enumerate() {
159+
let coverage = ref_count + alt_count;
160+
if coverage < args.min_depth {
161+
continue;
162+
}
163+
let alt_fraction = if coverage > 0 {
164+
(alt_count as f32 / coverage as f32) * 100.0
165+
} else {
166+
0.0
89167
};
90-
let alt_count = base_counts[alt_base_idx];
91-
if alt_count * 100 < coverage * args.min_alt_percent { continue; }
92-
writeln!(detailed_writer, "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", marker.pos + 1, marker.ref_base as char, marker.alt_base as char, base_counts[0], base_counts[1], base_counts[2], base_counts[3], marker.lineage)?;
93-
*lineage_counts.entry(marker.lineage.clone()).or_default() += 1;
168+
if alt_fraction < args.min_alt_percent as f32 {
169+
continue;
170+
}
171+
172+
let marker = &markers[marker_id];
173+
174+
let mut output_line = format!(
175+
"{}\t{}\t{}\t{}\t{}\t{:.2}\t{}",
176+
marker.pos + 1,
177+
marker.ref_allele,
178+
marker.alt_allele,
179+
ref_count,
180+
alt_count,
181+
alt_fraction,
182+
marker.lineage
183+
);
184+
185+
if !marker.annotations.is_empty() {
186+
output_line.push('\t');
187+
output_line.push_str(&marker.annotations.join("\t"));
188+
}
189+
writeln!(detailed_writer, "{}", output_line)?;
190+
191+
*lineage_counts
192+
.entry(marker.lineage.clone())
193+
.or_default() += 1;
94194
}
95-
info!(" Detailed report for {} written to {}", sample_name, detailed_output_path);
195+
info!(
196+
" Detailed report for {} written to {}",
197+
sample_name, detailed_output_path
198+
);
96199

97200
let summary_line = if lineage_counts.is_empty() {
98201
format!("{}\t\t", sample_name)
99202
} else {
100203
let mut sorted_lineages: Vec<(String, usize)> = lineage_counts.into_iter().collect();
101204
sorted_lineages.sort_by(|a, b| b.1.cmp(&a.1));
102-
let list_str = sorted_lineages.iter().map(|(l, c)| format!("{}:{}", l, c)).collect::<Vec<_>>().join(",");
103-
let major_lineage = if sorted_lineages.len() == 1 || sorted_lineages[0].1 > sorted_lineages[1].1 {
104-
sorted_lineages[0].0.clone()
105-
} else {
106-
let top_count = sorted_lineages[0].1;
107-
sorted_lineages.iter().filter(|(_, c)| *c == top_count).map(|(l, _)| l.clone()).collect::<Vec<_>>().join(",")
108-
};
205+
let list_str = sorted_lineages
206+
.iter()
207+
.map(|(l, c)| format!("{}:{}", l, c))
208+
.collect::<Vec<_>>()
209+
.join(",");
210+
let major_lineage =
211+
if sorted_lineages.len() == 1 || sorted_lineages[0].1 > sorted_lineages[1].1 {
212+
sorted_lineages[0].0.clone()
213+
} else {
214+
let top_count = sorted_lineages[0].1;
215+
sorted_lineages
216+
.iter()
217+
.filter(|(_, c)| *c == top_count)
218+
.map(|(l, _)| l.clone())
219+
.collect::<Vec<_>>()
220+
.join(",")
221+
};
109222
format!("{}\t{}\t{}", sample_name, list_str, major_lineage)
110223
};
111224
all_summary_lines.push(summary_line);
112225
}
113226

114227
let summary_path = format!("{}_summary.tsv", args.output_prefix);
115-
info!("▶ Writing final summary for all samples to {}", summary_path);
228+
info!(
229+
"▶ Writing final summary for all samples to {}",
230+
summary_path
231+
);
116232
let mut summary_writer = fs::File::create(&summary_path)?;
117233
writeln!(summary_writer, "genome\tlineage:count\tmajor_lineage")?;
118-
for line in all_summary_lines { writeln!(summary_writer, "{}", line)?; }
119-
234+
for line in all_summary_lines {
235+
writeln!(summary_writer, "{}", line)?;
236+
}
237+
120238
info!("✅ Process completed.");
121239
Ok(())
122240
}

0 commit comments

Comments
 (0)