|
| 1 | +//! src/match.rs |
| 2 | +//! |
| 3 | +//! Implements the `match` subcommand. |
| 4 | +//! |
| 5 | +//! This module takes input FASTQ reads and compares them against a collection |
| 6 | +//! of reference genomes provided in a single multi-FASTA file. It uses an |
| 7 | +//! efficient, k-mer-based weighted containment score to determine the best |
| 8 | +//! matching reference genome for the sample. |
| 9 | +
|
| 10 | +use crate::errors::{AppError, AppResult}; |
| 11 | +use clap::Parser; |
| 12 | +use indicatif::{ProgressBar, ProgressStyle}; |
| 13 | +use log::{debug, info, warn}; |
| 14 | +use needletail::{parse_fastx_file, Sequence}; |
| 15 | +use rayon::prelude::*; |
| 16 | +use rustc_hash::{FxHashMap, FxHashSet}; |
| 17 | +use std::fs; |
| 18 | +use std::fs::File; |
| 19 | +use std::io::Write; |
| 20 | + |
| 21 | +/// Command-line arguments for the `match` subcommand. |
| 22 | +#[derive(Parser, Debug)] |
| 23 | +pub struct Args { |
| 24 | + /// Path to one or more input FASTQ files (can be gzipped). Use this or --input-list. |
| 25 | + #[arg(short = 'i', long = "input", num_args = 1.., group = "input_method")] |
| 26 | + pub fastqs: Vec<String>, |
| 27 | + |
| 28 | + /// Path to a TSV file listing FASTQ files to process. |
| 29 | + /// Format: sample_name\t/path/to/reads1.fastq[\t/path/to/reads2.fastq] |
| 30 | + /// Use this or --input. |
| 31 | + #[arg(short = 'l', long = "input-list", group = "input_method")] |
| 32 | + pub input_list: Option<String>, |
| 33 | + |
| 34 | + /// Path to a single multi-FASTA file containing all reference genomes. |
| 35 | + #[arg(short = 'r', long = "references", required = true)] |
| 36 | + pub references: String, |
| 37 | + |
| 38 | + /// Path for the output TSV report. Prints to console if not provided. |
| 39 | + #[arg(short = 'o', long)] |
| 40 | + pub output: Option<String>, |
| 41 | + |
| 42 | + /// k-mer size for comparison. |
| 43 | + #[arg(short = 'k', long, default_value_t = 31)] |
| 44 | + pub kmer_size: u8, |
| 45 | + |
| 46 | + /// Number of CPU threads to use. Defaults to all available. |
| 47 | + #[arg(short = 't', long)] |
| 48 | + pub threads: Option<usize>, |
| 49 | +} |
| 50 | + |
| 51 | +/// Reads a list of FASTQ file paths from a TSV file. |
| 52 | +fn read_fastq_list_from_tsv(path: &str) -> AppResult<Vec<String>> { |
| 53 | + info!("Reading FASTQ file list from TSV: {}", path); |
| 54 | + let mut fastq_paths = Vec::new(); |
| 55 | + let content = fs::read_to_string(path)?; |
| 56 | + for line in content.lines() { |
| 57 | + if line.trim().is_empty() || line.starts_with('#') { |
| 58 | + continue; |
| 59 | + } |
| 60 | + // The format can be sample_name\tpath1\tpath2... |
| 61 | + // We collect all paths from the second column onwards. |
| 62 | + let paths: Vec<String> = line.split('\t').skip(1).map(String::from).collect(); |
| 63 | + if paths.is_empty() { |
| 64 | + warn!("Skipping malformed line in TSV (no paths found): {}", line); |
| 65 | + } |
| 66 | + fastq_paths.extend(paths); |
| 67 | + } |
| 68 | + if fastq_paths.is_empty() { |
| 69 | + return Err(AppError::NotEnoughData(format!("No FASTQ files found in the TSV list: {}", path))); |
| 70 | + } |
| 71 | + Ok(fastq_paths) |
| 72 | +} |
| 73 | + |
| 74 | + |
| 75 | +/// Counts k-mer occurrences from a set of FASTQ files in parallel. |
| 76 | +fn count_kmers_from_fastqs(paths: &[String], k: u8) -> AppResult<FxHashMap<u64, u32>> { |
| 77 | + info!("🧬 Counting k-mers from input FASTQ files..."); |
| 78 | + let pb = ProgressBar::new(paths.len() as u64); |
| 79 | + pb.set_style( |
| 80 | + ProgressStyle::default_bar() |
| 81 | + .template(" {spinner:.blue} Reading FASTQs: [{bar:40.cyan/blue}] {pos}/{len} {msg}")? |
| 82 | + .progress_chars("=>-"), |
| 83 | + ); |
| 84 | + |
| 85 | + let kmer_counts = paths |
| 86 | + .par_iter() |
| 87 | + .map(|path| { |
| 88 | + let mut local_counts = FxHashMap::default(); |
| 89 | + let mut reader = parse_fastx_file(path) |
| 90 | + .unwrap_or_else(|_| panic!("Failed to open or parse FASTQ file: {}", path)); |
| 91 | + |
| 92 | + while let Some(record) = reader.next() { |
| 93 | + let seqrec = record.expect("Invalid record in FASTQ"); |
| 94 | + // Use bit_kmers for canonical k-mers, which is robust to strandness |
| 95 | + for (_, kmer, _) in seqrec.sequence().bit_kmers(k, true) { |
| 96 | + *local_counts.entry(kmer.0).or_insert(0) += 1; |
| 97 | + } |
| 98 | + } |
| 99 | + pb.inc(1); |
| 100 | + local_counts |
| 101 | + }) |
| 102 | + .reduce(FxHashMap::default, |mut a, b| { |
| 103 | + // Combine the k-mer counts from all parallel jobs |
| 104 | + for (kmer, count) in b { |
| 105 | + *a.entry(kmer).or_insert(0) += count; |
| 106 | + } |
| 107 | + a |
| 108 | + }); |
| 109 | + |
| 110 | + pb.finish_with_message("FASTQ k-mer counting complete."); |
| 111 | + Ok(kmer_counts) |
| 112 | +} |
| 113 | + |
| 114 | +/// Extracts all reference sequences and their headers from a multi-FASTA file. |
| 115 | +fn read_references_from_multifasta(path: &str) -> AppResult<Vec<(String, Vec<u8>)>> { |
| 116 | + info!("📖 Loading reference genomes from multi-FASTA file..."); |
| 117 | + let mut refs = Vec::new(); |
| 118 | + let mut reader = parse_fastx_file(path) |
| 119 | + .map_err(|_| AppError::Generic(format!("Cannot open reference file: {}", path)))?; |
| 120 | + |
| 121 | + while let Some(record) = reader.next() { |
| 122 | + let seqrec = record.map_err(|_| AppError::Parsing("Invalid record in reference FASTA.".to_string()))?; |
| 123 | + let header = seqrec.id(); |
| 124 | + let seq = seqrec.seq().to_vec(); |
| 125 | + refs.push((String::from_utf8_lossy(header).to_string(), seq)); |
| 126 | + } |
| 127 | + info!(" Found {} reference sequences to compare against.", refs.len()); |
| 128 | + Ok(refs) |
| 129 | +} |
| 130 | + |
| 131 | +/// Main execution logic for the `match` subcommand. |
| 132 | +pub fn run(args: Args) -> AppResult<()> { |
| 133 | + if let Some(n) = args.threads { |
| 134 | + rayon::ThreadPoolBuilder::new() |
| 135 | + .num_threads(n) |
| 136 | + .build_global() |
| 137 | + .map_err(|e| AppError::Generic(format!("Failed to build thread pool: {}", e)))?; |
| 138 | + } |
| 139 | + |
| 140 | + // Determine the list of FASTQ files to process from either --input or --input-list |
| 141 | + let fastq_files_to_process = if let Some(list_path) = &args.input_list { |
| 142 | + read_fastq_list_from_tsv(list_path)? |
| 143 | + } else if !args.fastqs.is_empty() { |
| 144 | + args.fastqs |
| 145 | + } else { |
| 146 | + return Err(AppError::Generic( |
| 147 | + "An input source is required: either --input or --input-list must be provided.".to_string(), |
| 148 | + )); |
| 149 | + }; |
| 150 | + |
| 151 | + // Log which files are being processed |
| 152 | + info!("Analyzing FASTQ files: {}", fastq_files_to_process.join(", ")); |
| 153 | + |
| 154 | + // Step 1: Count k-mers from all input FASTQ files. |
| 155 | + let query_kmer_map = count_kmers_from_fastqs(&fastq_files_to_process, args.kmer_size)?; |
| 156 | + if query_kmer_map.is_empty() { |
| 157 | + return Err(AppError::NotEnoughData("No k-mers could be extracted from the input FASTQ files.".to_string())); |
| 158 | + } |
| 159 | + let total_query_kmers = query_kmer_map.values().sum::<u32>() as f64; |
| 160 | + debug!("Total k-mers counted in query: {}", total_query_kmers); |
| 161 | + |
| 162 | + // Step 2: Load all reference genomes from the multi-FASTA file. |
| 163 | + let references = read_references_from_multifasta(&args.references)?; |
| 164 | + |
| 165 | + // Step 3: Compare the query against each reference in parallel. |
| 166 | + info!("🔬 Comparing sample against references..."); |
| 167 | + let pb = ProgressBar::new(references.len() as u64); |
| 168 | + pb.set_style( |
| 169 | + ProgressStyle::default_bar() |
| 170 | + .template(" {spinner:.green} Matching: [{bar:40.magenta/purple}] {pos}/{len} ({eta})")? |
| 171 | + .progress_chars("=>-"), |
| 172 | + ); |
| 173 | + |
| 174 | + let mut scores: Vec<(String, f64)> = references |
| 175 | + .par_iter() |
| 176 | + .map(|(header, seq)| { |
| 177 | + // Generate the set of unique k-mers for this reference. |
| 178 | + let ref_kmers: FxHashSet<u64> = seq.bit_kmers(args.kmer_size, true).map(|(_, kmer, _)| kmer.0).collect(); |
| 179 | + |
| 180 | + // Calculate the weighted containment score. |
| 181 | + let shared_kmer_count: u32 = query_kmer_map |
| 182 | + .iter() |
| 183 | + .filter(|(kmer, _)| ref_kmers.contains(kmer)) |
| 184 | + .map(|(_, count)| count) |
| 185 | + .sum(); |
| 186 | + |
| 187 | + let score = if total_query_kmers > 0.0 { |
| 188 | + shared_kmer_count as f64 / total_query_kmers |
| 189 | + } else { |
| 190 | + 0.0 |
| 191 | + }; |
| 192 | + pb.inc(1); |
| 193 | + (header.clone(), score) |
| 194 | + }) |
| 195 | + .collect(); |
| 196 | + |
| 197 | + pb.finish_with_message("Matching complete."); |
| 198 | + |
| 199 | + // Step 4: Sort and report only the best result. |
| 200 | + scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); |
| 201 | + |
| 202 | + info!("✅ Comparison finished. Writing report..."); |
| 203 | + let mut writer: Box<dyn Write> = if let Some(path) = &args.output { |
| 204 | + Box::new(File::create(path)?) |
| 205 | + } else { |
| 206 | + Box::new(std::io::stdout()) |
| 207 | + }; |
| 208 | + |
| 209 | + writeln!(writer, "Query_Files\tBest_Match_Reference\tShared_Kmer_Fraction")?; |
| 210 | + if let Some((best_header, best_score)) = scores.first() { |
| 211 | + let query_files_str = fastq_files_to_process.join(","); |
| 212 | + writeln!(writer, "{}\t{}\t{:.4}", query_files_str, best_header, best_score)?; |
| 213 | + } else { |
| 214 | + info!("No matching references found."); |
| 215 | + } |
| 216 | + |
| 217 | + Ok(()) |
| 218 | +} |
0 commit comments