Skip to content

Commit f3c9dc7

Browse files
authored
add match
1 parent 769e104 commit f3c9dc7

2 files changed

Lines changed: 226 additions & 1 deletion

File tree

src/main.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ mod classify;
2020
mod classify_split_fastq;
2121
mod common;
2222
mod errors;
23+
// MODIFIED: Add the new 'match' module. Use r# to avoid keyword conflict.
24+
mod r#match;
2325
mod predict;
2426
mod split_kmer;
2527
mod train;
@@ -32,7 +34,7 @@ use errors::AppResult;
3234
#[derive(Parser)]
3335
#[command(
3436
name = "pathotypr",
35-
version = "0.1.0",
37+
version = "0.2.0",
3638
author = "Paula Ruiz Rodriguez",
3739
about = "A versatile toolkit for genome classification and variant genotyping."
3840
)]
@@ -55,6 +57,9 @@ enum Commands {
5557
Classify(classify::Args),
5658
/// Perform split-k-mer typing directly from FASTQ files.
5759
SplitFastq(classify_split_fastq::SplitFastqArgs),
60+
// MODIFIED: Add the 'match' subcommand.
61+
/// Find the best matching reference for a set of FASTQ reads.
62+
Match(r#match::Args),
5863
}
5964

6065
/* ----------------- Logger Initialization ----------------- */
@@ -100,6 +105,8 @@ fn main() -> AppResult<()> {
100105
Commands::Predict(a) => predict::run(a)?,
101106
Commands::Classify(a) => classify::run(a)?,
102107
Commands::SplitFastq(a) => classify_split_fastq::run(a)?,
108+
// MODIFIED: Add the handler for the 'match' subcommand.
109+
Commands::Match(a) => r#match::run(a)?,
103110
}
104111
Ok(())
105112
}

src/match.rs

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
//! src/match.rs
2+
//!
3+
//! Implements the `match` subcommand.
4+
//!
5+
//! This module takes input FASTQ reads and compares them against a collection
6+
//! of reference genomes provided in a single multi-FASTA file. It uses an
7+
//! efficient, k-mer-based weighted containment score to determine the best
8+
//! matching reference genome for the sample.
9+
10+
use crate::errors::{AppError, AppResult};
11+
use clap::Parser;
12+
use indicatif::{ProgressBar, ProgressStyle};
13+
use log::{debug, info, warn};
14+
use needletail::{parse_fastx_file, Sequence};
15+
use rayon::prelude::*;
16+
use rustc_hash::{FxHashMap, FxHashSet};
17+
use std::fs;
18+
use std::fs::File;
19+
use std::io::Write;
20+
21+
/// Command-line arguments for the `match` subcommand.
22+
#[derive(Parser, Debug)]
23+
pub struct Args {
24+
/// Path to one or more input FASTQ files (can be gzipped). Use this or --input-list.
25+
#[arg(short = 'i', long = "input", num_args = 1.., group = "input_method")]
26+
pub fastqs: Vec<String>,
27+
28+
/// Path to a TSV file listing FASTQ files to process.
29+
/// Format: sample_name\t/path/to/reads1.fastq[\t/path/to/reads2.fastq]
30+
/// Use this or --input.
31+
#[arg(short = 'l', long = "input-list", group = "input_method")]
32+
pub input_list: Option<String>,
33+
34+
/// Path to a single multi-FASTA file containing all reference genomes.
35+
#[arg(short = 'r', long = "references", required = true)]
36+
pub references: String,
37+
38+
/// Path for the output TSV report. Prints to console if not provided.
39+
#[arg(short = 'o', long)]
40+
pub output: Option<String>,
41+
42+
/// k-mer size for comparison.
43+
#[arg(short = 'k', long, default_value_t = 31)]
44+
pub kmer_size: u8,
45+
46+
/// Number of CPU threads to use. Defaults to all available.
47+
#[arg(short = 't', long)]
48+
pub threads: Option<usize>,
49+
}
50+
51+
/// Reads a list of FASTQ file paths from a TSV file.
52+
fn read_fastq_list_from_tsv(path: &str) -> AppResult<Vec<String>> {
53+
info!("Reading FASTQ file list from TSV: {}", path);
54+
let mut fastq_paths = Vec::new();
55+
let content = fs::read_to_string(path)?;
56+
for line in content.lines() {
57+
if line.trim().is_empty() || line.starts_with('#') {
58+
continue;
59+
}
60+
// The format can be sample_name\tpath1\tpath2...
61+
// We collect all paths from the second column onwards.
62+
let paths: Vec<String> = line.split('\t').skip(1).map(String::from).collect();
63+
if paths.is_empty() {
64+
warn!("Skipping malformed line in TSV (no paths found): {}", line);
65+
}
66+
fastq_paths.extend(paths);
67+
}
68+
if fastq_paths.is_empty() {
69+
return Err(AppError::NotEnoughData(format!("No FASTQ files found in the TSV list: {}", path)));
70+
}
71+
Ok(fastq_paths)
72+
}
73+
74+
75+
/// Counts k-mer occurrences from a set of FASTQ files in parallel.
76+
fn count_kmers_from_fastqs(paths: &[String], k: u8) -> AppResult<FxHashMap<u64, u32>> {
77+
info!("🧬 Counting k-mers from input FASTQ files...");
78+
let pb = ProgressBar::new(paths.len() as u64);
79+
pb.set_style(
80+
ProgressStyle::default_bar()
81+
.template(" {spinner:.blue} Reading FASTQs: [{bar:40.cyan/blue}] {pos}/{len} {msg}")?
82+
.progress_chars("=>-"),
83+
);
84+
85+
let kmer_counts = paths
86+
.par_iter()
87+
.map(|path| {
88+
let mut local_counts = FxHashMap::default();
89+
let mut reader = parse_fastx_file(path)
90+
.unwrap_or_else(|_| panic!("Failed to open or parse FASTQ file: {}", path));
91+
92+
while let Some(record) = reader.next() {
93+
let seqrec = record.expect("Invalid record in FASTQ");
94+
// Use bit_kmers for canonical k-mers, which is robust to strandness
95+
for (_, kmer, _) in seqrec.sequence().bit_kmers(k, true) {
96+
*local_counts.entry(kmer.0).or_insert(0) += 1;
97+
}
98+
}
99+
pb.inc(1);
100+
local_counts
101+
})
102+
.reduce(FxHashMap::default, |mut a, b| {
103+
// Combine the k-mer counts from all parallel jobs
104+
for (kmer, count) in b {
105+
*a.entry(kmer).or_insert(0) += count;
106+
}
107+
a
108+
});
109+
110+
pb.finish_with_message("FASTQ k-mer counting complete.");
111+
Ok(kmer_counts)
112+
}
113+
114+
/// Extracts all reference sequences and their headers from a multi-FASTA file.
115+
fn read_references_from_multifasta(path: &str) -> AppResult<Vec<(String, Vec<u8>)>> {
116+
info!("📖 Loading reference genomes from multi-FASTA file...");
117+
let mut refs = Vec::new();
118+
let mut reader = parse_fastx_file(path)
119+
.map_err(|_| AppError::Generic(format!("Cannot open reference file: {}", path)))?;
120+
121+
while let Some(record) = reader.next() {
122+
let seqrec = record.map_err(|_| AppError::Parsing("Invalid record in reference FASTA.".to_string()))?;
123+
let header = seqrec.id();
124+
let seq = seqrec.seq().to_vec();
125+
refs.push((String::from_utf8_lossy(header).to_string(), seq));
126+
}
127+
info!(" Found {} reference sequences to compare against.", refs.len());
128+
Ok(refs)
129+
}
130+
131+
/// Main execution logic for the `match` subcommand.
132+
pub fn run(args: Args) -> AppResult<()> {
133+
if let Some(n) = args.threads {
134+
rayon::ThreadPoolBuilder::new()
135+
.num_threads(n)
136+
.build_global()
137+
.map_err(|e| AppError::Generic(format!("Failed to build thread pool: {}", e)))?;
138+
}
139+
140+
// Determine the list of FASTQ files to process from either --input or --input-list
141+
let fastq_files_to_process = if let Some(list_path) = &args.input_list {
142+
read_fastq_list_from_tsv(list_path)?
143+
} else if !args.fastqs.is_empty() {
144+
args.fastqs
145+
} else {
146+
return Err(AppError::Generic(
147+
"An input source is required: either --input or --input-list must be provided.".to_string(),
148+
));
149+
};
150+
151+
// Log which files are being processed
152+
info!("Analyzing FASTQ files: {}", fastq_files_to_process.join(", "));
153+
154+
// Step 1: Count k-mers from all input FASTQ files.
155+
let query_kmer_map = count_kmers_from_fastqs(&fastq_files_to_process, args.kmer_size)?;
156+
if query_kmer_map.is_empty() {
157+
return Err(AppError::NotEnoughData("No k-mers could be extracted from the input FASTQ files.".to_string()));
158+
}
159+
let total_query_kmers = query_kmer_map.values().sum::<u32>() as f64;
160+
debug!("Total k-mers counted in query: {}", total_query_kmers);
161+
162+
// Step 2: Load all reference genomes from the multi-FASTA file.
163+
let references = read_references_from_multifasta(&args.references)?;
164+
165+
// Step 3: Compare the query against each reference in parallel.
166+
info!("🔬 Comparing sample against references...");
167+
let pb = ProgressBar::new(references.len() as u64);
168+
pb.set_style(
169+
ProgressStyle::default_bar()
170+
.template(" {spinner:.green} Matching: [{bar:40.magenta/purple}] {pos}/{len} ({eta})")?
171+
.progress_chars("=>-"),
172+
);
173+
174+
let mut scores: Vec<(String, f64)> = references
175+
.par_iter()
176+
.map(|(header, seq)| {
177+
// Generate the set of unique k-mers for this reference.
178+
let ref_kmers: FxHashSet<u64> = seq.bit_kmers(args.kmer_size, true).map(|(_, kmer, _)| kmer.0).collect();
179+
180+
// Calculate the weighted containment score.
181+
let shared_kmer_count: u32 = query_kmer_map
182+
.iter()
183+
.filter(|(kmer, _)| ref_kmers.contains(kmer))
184+
.map(|(_, count)| count)
185+
.sum();
186+
187+
let score = if total_query_kmers > 0.0 {
188+
shared_kmer_count as f64 / total_query_kmers
189+
} else {
190+
0.0
191+
};
192+
pb.inc(1);
193+
(header.clone(), score)
194+
})
195+
.collect();
196+
197+
pb.finish_with_message("Matching complete.");
198+
199+
// Step 4: Sort and report only the best result.
200+
scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
201+
202+
info!("✅ Comparison finished. Writing report...");
203+
let mut writer: Box<dyn Write> = if let Some(path) = &args.output {
204+
Box::new(File::create(path)?)
205+
} else {
206+
Box::new(std::io::stdout())
207+
};
208+
209+
writeln!(writer, "Query_Files\tBest_Match_Reference\tShared_Kmer_Fraction")?;
210+
if let Some((best_header, best_score)) = scores.first() {
211+
let query_files_str = fastq_files_to_process.join(",");
212+
writeln!(writer, "{}\t{}\t{:.4}", query_files_str, best_header, best_score)?;
213+
} else {
214+
info!("No matching references found.");
215+
}
216+
217+
Ok(())
218+
}

0 commit comments

Comments
 (0)