From e4c9a4794916f146727378e9959197a0df5c91a6 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 26 May 2026 12:00:19 +0200 Subject: [PATCH 1/9] feat(analyze): add mutation pattern config types and analysis engine - Tagged union event model (MutationPatternEvent::NucSubstitution) allows adding event types without changing output structure - Regex motifs matched against reference sequence replace fixed trinucleotide IUPAC context, supporting arbitrary-length patterns - Context uses global reference genome, matching mutational signature conventions where the enzyme acts on the physical genome --- Cargo.lock | 12 + Cargo.toml | 1 + packages/nextclade/Cargo.toml | 1 + packages/nextclade/src/analyze/mod.rs | 2 + .../src/analyze/mutation_patterns.rs | 1160 +++++++++++++++++ .../nextclade/src/analyze/nuc_sub_context.rs | 75 ++ .../nextclade/src/analyze/virus_properties.rs | 171 ++- 7 files changed, 1421 insertions(+), 1 deletion(-) create mode 100644 packages/nextclade/src/analyze/mutation_patterns.rs create mode 100644 packages/nextclade/src/analyze/nuc_sub_context.rs diff --git a/Cargo.lock b/Cargo.lock index 2dad7585b..2f6590377 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2154,6 +2154,7 @@ dependencies = [ "serde_repr", "serde_stacker", "serde_yaml", + "smart-default", "strsim", "strum 0.27.2", "strum_macros 0.27.2", @@ -3268,6 +3269,17 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "smart-default" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eb01866308440fc64d6c44d9e86c5cc17adfe33c4d6eed55da9145044d0ffc1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.112", +] + [[package]] name = "socket2" version = "0.5.10" diff --git a/Cargo.toml b/Cargo.toml index d73c802ee..bd790b297 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,7 @@ serde_json = { version = "=1.0.148", features = ["preserve_order", "indexmap", " serde_repr = "=0.1.20" serde_stacker = { version = "=0.1.14" } serde_yaml = "=0.9.34" +smart-default = "=0.7.1" strsim = "=0.11.1" strum = "=0.27.2" strum_macros = "=0.27.2" diff --git a/packages/nextclade/Cargo.toml b/packages/nextclade/Cargo.toml index a61bc871f..acc7486ba 100644 --- a/packages/nextclade/Cargo.toml +++ b/packages/nextclade/Cargo.toml @@ -69,6 +69,7 @@ serde_json = { workspace = true } serde_repr = { workspace = true } serde_stacker = { workspace = true } serde_yaml = { workspace = true } +smart-default = { workspace = true } strsim = { workspace = true } strum = { workspace = true } strum_macros = { workspace = true } diff --git a/packages/nextclade/src/analyze/mod.rs b/packages/nextclade/src/analyze/mod.rs index 641c3f429..2bb614ae2 100644 --- a/packages/nextclade/src/analyze/mod.rs +++ b/packages/nextclade/src/analyze/mod.rs @@ -20,10 +20,12 @@ pub mod group_adjacent_deletions; pub mod is_sequenced; pub mod letter_composition; pub mod letter_ranges; +pub mod mutation_patterns; pub mod nuc_alignment; pub mod nuc_changes; pub mod nuc_del; pub mod nuc_sub; +pub mod nuc_sub_context; pub mod pcr_primer_changes; pub mod pcr_primers; pub mod phenotype; diff --git a/packages/nextclade/src/analyze/mutation_patterns.rs b/packages/nextclade/src/analyze/mutation_patterns.rs new file mode 100644 index 000000000..18d89895d --- /dev/null +++ b/packages/nextclade/src/analyze/mutation_patterns.rs @@ -0,0 +1,1160 @@ +use crate::alphabet::nuc::{Nuc, from_nuc_seq, is_nuc_match}; +use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; +use crate::analyze::nuc_sub::NucSub; +use crate::analyze::nuc_sub_context::NucSubWithContext; +use crate::analyze::virus_properties::{ + MutationPatternClusterConfig, MutationPatternEvent, MutationPatternNucSubstitution, MutationPatternsConfig, +}; +use crate::coord::position::{NucRefGlobalPosition, PositionLike}; +use crate::qc::qc_config::QcRulesConfigSnpClusters; +use eyre::{Report, WrapErr}; +use itertools::Itertools; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::collections::VecDeque; + +/// Cluster of mutation pattern events detected within one sliding nucleotide window. +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternCluster::example")] +pub struct MutationPatternCluster { + /// 0-based first reference position included in this cluster. + pub start: usize, + + /// 0-based last reference position included in this cluster. + pub end: usize, + + /// Number of matched events in this cluster. + pub count: usize, + + /// Matched events belonging to this cluster, sorted by reference position. + pub events: Vec, + + /// Counts of event types within this cluster. + pub event_type_counts: Vec, +} + +impl MutationPatternCluster { + pub fn example() -> Self { + Self { + start: 5003, + end: 5033, + count: 2, + events: vec![ + MutationPatternEventMatch::example_nuc_substitution(5003, Nuc::A, Nuc::G), + MutationPatternEventMatch::example_nuc_substitution(5033, Nuc::A, Nuc::G), + ], + event_type_counts: vec![MutationPatternEventTypeCount::NucSubstitution( + MutationPatternNucSubstitutionTypeCount { + ref_nuc: Nuc::A, + qry_nuc: Nuc::G, + count: 2, + }, + )], + } + } +} + +/// Count of matched mutation pattern events of one event type. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(tag = "type", rename_all = "camelCase")] +#[schemars(example = "MutationPatternEventTypeCount::example")] +pub enum MutationPatternEventTypeCount { + /// Count of nucleotide substitutions with the same reference and query nucleotide. + NucSubstitution(MutationPatternNucSubstitutionTypeCount), +} + +impl MutationPatternEventTypeCount { + pub const fn example() -> Self { + Self::NucSubstitution(MutationPatternNucSubstitutionTypeCount { + ref_nuc: Nuc::A, + qry_nuc: Nuc::G, + count: 8, + }) + } +} + +/// Count of nucleotide substitution events sharing the same reference and query nucleotide. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternNucSubstitutionTypeCount::example")] +pub struct MutationPatternNucSubstitutionTypeCount { + /// Reference nucleotide at the substituted position. + pub ref_nuc: Nuc, + + /// Query nucleotide at the substituted position. + pub qry_nuc: Nuc, + + /// Number of matching substitutions with this reference and query nucleotide pair. + pub count: usize, +} + +impl MutationPatternNucSubstitutionTypeCount { + pub const fn example() -> Self { + Self { + ref_nuc: Nuc::A, + qry_nuc: Nuc::G, + count: 8, + } + } +} + +/// Reference motif match that overlapped a mutation pattern event. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternMotifMatch::example")] +pub struct MutationPatternMotifMatch { + /// Regular expression from the pattern configuration that matched the reference sequence. + pub motif: String, + + /// 0-based first reference position included in the motif match. + pub start: usize, + + /// 0-based position after the end of the motif match. + pub end: usize, +} + +impl MutationPatternMotifMatch { + pub fn example() -> Self { + Self { + motif: "A[ACGT]G".to_owned(), + start: 5002, + end: 5005, + } + } +} + +/// Matched mutation pattern event. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(tag = "type", rename_all = "camelCase")] +#[schemars(example = "MutationPatternEventMatch::example")] +pub enum MutationPatternEventMatch { + /// Matched nucleotide substitution event. + NucSubstitution(MutationPatternNucSubstitutionMatch), +} + +impl MutationPatternEventMatch { + pub fn example() -> Self { + Self::example_nuc_substitution(5003, Nuc::A, Nuc::G) + } + + const fn unmatched_nuc_substitution(substitution: NucSubWithContext) -> Self { + Self::NucSubstitution(MutationPatternNucSubstitutionMatch { + substitution, + motif_matches: vec![], + }) + } + + fn example_nuc_substitution(pos: usize, ref_nuc: Nuc, qry_nuc: Nuc) -> Self { + Self::NucSubstitution(MutationPatternNucSubstitutionMatch { + substitution: NucSubWithContext { + sub: NucSub { + pos: NucRefGlobalPosition::from(pos), + ref_nuc, + qry_nuc, + }, + ref_context: vec![Nuc::A, ref_nuc, Nuc::G], + }, + motif_matches: vec![MutationPatternMotifMatch { + motif: "A[ACGT]G".to_owned(), + start: pos.saturating_sub(1), + end: pos + 2, + }], + }) + } +} + +/// Matched nucleotide substitution and the reference motifs that accepted it. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternNucSubstitutionMatch::example")] +pub struct MutationPatternNucSubstitutionMatch { + /// Nucleotide substitution plus local reference context at the substituted position. + #[serde(flatten)] + pub substitution: NucSubWithContext, + + /// Motif matches that spanned the substituted position. Empty when the pattern event had no motif restriction. + pub motif_matches: Vec, +} + +impl MutationPatternNucSubstitutionMatch { + pub fn example() -> Self { + match MutationPatternEventMatch::example() { + MutationPatternEventMatch::NucSubstitution(event) => event, + } + } +} + +/// Summary counts for one mutation pattern. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternCounts::example")] +pub struct MutationPatternCounts { + /// Number of private mutation events matching the pattern. + pub matches: usize, + + /// Number of matching events that belong to reported clusters. + pub clustered: usize, + + /// Number of clusters reported for this pattern. + pub clusters: usize, +} + +impl MutationPatternCounts { + pub const fn example() -> Self { + Self { + matches: 14, + clustered: 14, + clusters: 2, + } + } +} + +/// Results for a single mutation pattern. +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternResults::example")] +pub struct MutationPatternResults { + /// Stable machine-readable pattern identifier copied from pathogen.json. + pub id: String, + + /// Human-readable pattern name copied from pathogen.json. + pub name: String, + + /// All private mutation events matching this pattern. + pub matches: Vec, + + /// Counts of matched event types across all `matches`. + pub event_type_counts: Vec, + + /// Pattern-local clusters detected among `matches`. + pub clusters: Vec, + + /// Summary counts for matches and clusters. + pub counts: MutationPatternCounts, + + /// Optional explanatory text copied from pathogen.json. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, +} + +impl MutationPatternResults { + pub fn example() -> Self { + let cluster = MutationPatternCluster::example(); + Self { + id: "adar".to_owned(), + name: "ADAR-like RNA editing".to_owned(), + matches: cluster.events.clone(), + event_type_counts: cluster.event_type_counts.clone(), + clusters: vec![cluster], + counts: MutationPatternCounts { + matches: 2, + clustered: 2, + clusters: 1, + }, + description: Some("ADAR-mediated A-to-I editing observed as A>G and complementary T>C".to_owned()), + } + } +} + +/// Mutation pattern analysis output. Contains per-pattern results. +#[derive(Clone, Debug, Default, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternsResults::example")] +pub struct MutationPatternsResults { + /// Results for each configured mutation pattern. Empty when mutation pattern analysis is not configured and no global + /// SNP cluster compatibility output is needed. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub results: Vec, +} + +impl MutationPatternsResults { + pub fn example() -> Self { + Self { + results: vec![MutationPatternResults::example()], + } + } +} + +/// Internal result containing both filtered (for output) and unfiltered (for QC) clusters. +pub struct MutationPatternAnalysis { + pub results: MutationPatternsResults, + pub qc_clusters: Vec, +} + +/// Trinucleotide context uses the global reference sequence, not the parent tree node state. +/// This matches mutational signature conventions: the enzyme acts on the physical genome, +/// and context is defined by the reference genome flanking the substituted position. +pub fn analyze_mutation_patterns( + private_nuc_mutations: &PrivateNucMutations, + ref_seq: &[Nuc], + config: Option<&MutationPatternsConfig>, + qc_snp_clusters_config: Option<&QcRulesConfigSnpClusters>, +) -> Result { + let ref_seq_str = from_nuc_seq(ref_seq); + let context_subs: Vec = private_nuc_mutations + .private_substitutions + .iter() + .map(|sub| NucSubWithContext::from_sub(sub, ref_seq)) + .collect_vec(); + + let all_events = context_subs + .iter() + .cloned() + .map(MutationPatternEventMatch::unmatched_nuc_substitution) + .collect_vec(); + let event_type_counts = compute_event_type_counts(&all_events); + + let patterns = config.map_or(&[][..], |c| &c.patterns); + + let (qc_window_size, qc_cluster_cut_off) = resolve_qc_config(qc_snp_clusters_config); + + let qc_clusters = if qc_window_size > 0 && qc_cluster_cut_off > 0 { + find_clusters(&all_events, qc_window_size, qc_cluster_cut_off) + } else { + vec![] + }; + + let results = if patterns.is_empty() { + if qc_clusters.is_empty() { + vec![] + } else { + let total_clusters = qc_clusters.len(); + let total_clustered = qc_clusters.iter().map(|c| c.count).sum(); + vec![MutationPatternResults { + id: "snp_clusters".to_owned(), + name: "SNP clusters".to_owned(), + matches: all_events, + event_type_counts, + clusters: qc_clusters.clone(), + counts: MutationPatternCounts { + matches: context_subs.len(), + clustered: total_clustered, + clusters: total_clusters, + }, + description: None, + }] + } + } else { + patterns + .iter() + .map(|cfg| { + let compiled_events = compile_events(&cfg.events)?; + let matches = if compiled_events.is_empty() { + context_subs + .iter() + .cloned() + .map(MutationPatternEventMatch::unmatched_nuc_substitution) + .collect_vec() + } else { + context_subs + .iter() + .filter_map(|s| match_event(s, &compiled_events, &ref_seq_str)) + .collect_vec() + }; + let clusters = if let Some(cluster) = &cfg.cluster { + find_clusters_with_config(&matches, cluster) + } else { + vec![] + }; + let total_clusters = clusters.len(); + let total_clustered = clusters.iter().map(|c| c.count).sum(); + Ok(MutationPatternResults { + id: cfg.id.clone(), + name: cfg.name.clone(), + event_type_counts: compute_event_type_counts(&matches), + counts: MutationPatternCounts { + matches: matches.len(), + clustered: total_clustered, + clusters: total_clusters, + }, + matches, + clusters, + description: cfg.description.clone(), + }) + }) + .collect::, Report>>()? + }; + + Ok(MutationPatternAnalysis { + results: MutationPatternsResults { results }, + qc_clusters, + }) +} + +fn compute_event_type_counts(events: &[MutationPatternEventMatch]) -> Vec { + let mut counts: BTreeMap<(Nuc, Nuc), usize> = BTreeMap::new(); + for event in events { + let MutationPatternEventMatch::NucSubstitution(event) = event; + *counts + .entry((event.substitution.sub.ref_nuc, event.substitution.sub.qry_nuc)) + .or_default() += 1; + } + counts + .into_iter() + .map(|((ref_nuc, qry_nuc), count)| { + MutationPatternEventTypeCount::NucSubstitution(MutationPatternNucSubstitutionTypeCount { + ref_nuc, + qry_nuc, + count, + }) + }) + .collect_vec() +} + +fn event_match_position(event: &MutationPatternEventMatch) -> usize { + match event { + MutationPatternEventMatch::NucSubstitution(event) => event.substitution.sub.pos.as_usize(), + } +} + +fn compile_events(events: &[MutationPatternEvent]) -> Result, Report> { + events + .iter() + .map(CompiledEvent::new) + .collect::, Report>>() +} + +fn match_event( + sub: &NucSubWithContext, + events: &[CompiledEvent], + ref_seq_str: &str, +) -> Option { + events + .iter() + .find_map(|event| event.match_substitution(sub, ref_seq_str)) +} + +fn find_clusters_with_config( + events: &[MutationPatternEventMatch], + config: &MutationPatternClusterConfig, +) -> Vec { + if config.window_size > 0 && config.cutoff > 0 { + find_clusters(events, config.window_size, config.cutoff) + } else { + vec![] + } +} + +enum CompiledEvent { + NucSubstitution(CompiledNucSubstitution), +} + +impl CompiledEvent { + fn new(event: &MutationPatternEvent) -> Result { + match event { + MutationPatternEvent::NucSubstitution(event) => Ok(Self::NucSubstitution(CompiledNucSubstitution::new(event)?)), + } + } + + fn match_substitution(&self, sub: &NucSubWithContext, ref_seq_str: &str) -> Option { + match self { + Self::NucSubstitution(event) => event.match_substitution(sub, ref_seq_str), + } + } +} + +struct CompiledNucSubstitution { + ref_nucs: Vec, + qry: Vec, + motifs: Vec, +} + +impl CompiledNucSubstitution { + fn new(event: &MutationPatternNucSubstitution) -> Result { + Ok(Self { + ref_nucs: event.ref_nucs.clone(), + qry: event.qry.clone(), + motifs: event + .motifs + .iter() + .map(|motif| CompiledMotif::new(motif)) + .collect::, Report>>()?, + }) + } + + fn match_substitution(&self, sub: &NucSubWithContext, ref_seq_str: &str) -> Option { + if !self.ref_nucs.iter().any(|nuc| is_nuc_match(*nuc, sub.sub.ref_nuc)) { + return None; + } + if !self.qry.iter().any(|nuc| is_nuc_match(*nuc, sub.sub.qry_nuc)) { + return None; + } + + if self.motifs.is_empty() { + return Some(MutationPatternEventMatch::unmatched_nuc_substitution(sub.clone())); + } + + let motif_matches = self + .motifs + .iter() + .flat_map(|motif| motif.find_matches_containing(ref_seq_str, sub.sub.pos.as_usize())) + .collect_vec(); + + if motif_matches.is_empty() { + None + } else { + Some(MutationPatternEventMatch::NucSubstitution( + MutationPatternNucSubstitutionMatch { + substitution: sub.clone(), + motif_matches, + }, + )) + } + } +} + +struct CompiledMotif { + motif: String, + regex: Regex, +} + +impl CompiledMotif { + fn new(motif: &str) -> Result { + if motif.is_empty() { + eyre::bail!("Mutation pattern motif cannot be empty"); + } + Ok(Self { + motif: motif.to_owned(), + regex: Regex::new(motif).wrap_err_with(|| format!("When compiling mutation pattern motif '{motif}'"))?, + }) + } + + fn find_matches_containing(&self, ref_seq_str: &str, pos: usize) -> Vec { + self + .regex + .find_iter(ref_seq_str) + .filter(|m| m.start() <= pos && pos < m.end()) + .map(|m| MutationPatternMotifMatch { + motif: self.motif.clone(), + start: m.start(), + end: m.end(), + }) + .collect_vec() + } +} + +const fn resolve_qc_config(qc_snp_clusters_config: Option<&QcRulesConfigSnpClusters>) -> (usize, usize) { + if let Some(qc_snp_clusters) = qc_snp_clusters_config { + if qc_snp_clusters.enabled { + (qc_snp_clusters.window_size, qc_snp_clusters.cluster_cut_off) + } else { + (0, 0) + } + } else { + (0, 0) + } +} + +fn find_clusters( + events: &[MutationPatternEventMatch], + window_size: usize, + cluster_cut_off: usize, +) -> Vec { + let mut current_window = VecDeque::<&MutationPatternEventMatch>::new(); + let mut all_clusters: Vec> = Vec::new(); + let mut previous_pos: isize = -1; + + for event in events { + let pos = event_match_position(event) as isize; + current_window.push_back(event); + + while (event_match_position(current_window[0]) as isize) < (pos - window_size as isize) { + current_window.pop_front(); + } + + if current_window.len() > cluster_cut_off { + let n_clusters = all_clusters.len(); + + if !all_clusters.is_empty() && current_window.len() > 1 { + let last_cluster = &all_clusters[n_clusters - 1]; + let last_pos = event_match_position(last_cluster[last_cluster.len() - 1]) as isize; + + if last_pos == previous_pos { + all_clusters[n_clusters - 1].push(event); + } else { + all_clusters.push(current_window.iter().copied().collect_vec()); + } + } else { + all_clusters.push(current_window.iter().copied().collect_vec()); + } + } + previous_pos = pos; + } + + for cluster in &mut all_clusters { + cluster.sort_by_key(|event| event_match_position(event)); + cluster.dedup_by_key(|event| event_match_position(event)); + } + + all_clusters + .into_iter() + .map(|cluster_events| { + let start = event_match_position(cluster_events[0]); + let end = event_match_position(cluster_events[cluster_events.len() - 1]); + let events: Vec = cluster_events.into_iter().cloned().collect(); + let event_type_counts = compute_event_type_counts(&events); + let count = events.len(); + MutationPatternCluster { + start, + end, + count, + events, + event_type_counts, + } + }) + .collect_vec() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::alphabet::nuc::to_nuc_seq; + use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; + use crate::analyze::nuc_sub::NucSub; + use crate::analyze::virus_properties::MutationPatternConfig; + use crate::coord::position::NucRefGlobalPosition; + use crate::io::json::{JsonPretty, json_parse, json_stringify}; + use eyre::Report; + use pretty_assertions::assert_eq; + use serde_json::Value; + + fn make_sub(pos: usize, ref_nuc: Nuc, qry_nuc: Nuc) -> NucSub { + NucSub { + pos: NucRefGlobalPosition::from(pos), + ref_nuc, + qry_nuc, + } + } + + fn make_private_muts(subs: Vec) -> PrivateNucMutations { + let total = subs.len(); + PrivateNucMutations { + private_substitutions: subs, + total_private_substitutions: total, + ..PrivateNucMutations::default() + } + } + + fn ref_seq_acgt(len: usize) -> Vec { + let pattern = [Nuc::A, Nuc::C, Nuc::G, Nuc::T]; + (0..len).map(|i| pattern[i % 4]).collect() + } + + fn wrap(patterns: Vec) -> MutationPatternsConfig { + MutationPatternsConfig { patterns } + } + + fn cluster(window_size: usize, cutoff: usize) -> MutationPatternClusterConfig { + MutationPatternClusterConfig { window_size, cutoff } + } + + fn qc_cluster(window_size: usize, cluster_cut_off: usize) -> QcRulesConfigSnpClusters { + QcRulesConfigSnpClusters { + enabled: true, + score_weight: ordered_float::OrderedFloat(50.0), + window_size, + cluster_cut_off, + } + } + + fn event_type_counts_map(counts: &[MutationPatternEventTypeCount]) -> BTreeMap<(Nuc, Nuc), usize> { + counts + .iter() + .map(|c| match c { + MutationPatternEventTypeCount::NucSubstitution(c) => ((c.ref_nuc, c.qry_nuc), c.count), + }) + .collect() + } + + fn event_type_counts_total(counts: &[MutationPatternEventTypeCount]) -> usize { + counts + .iter() + .map(|c| match c { + MutationPatternEventTypeCount::NucSubstitution(c) => c.count, + }) + .sum() + } + + fn assert_cluster_events_are_nuc_substitutions(cluster: &MutationPatternCluster, ref_nuc: Nuc, qry_nuc: Nuc) { + for event in &cluster.events { + let MutationPatternEventMatch::NucSubstitution(event) = event; + assert_eq!(ref_nuc, event.substitution.sub.ref_nuc); + assert_eq!(qry_nuc, event.substitution.sub.qry_nuc); + } + } + + fn pattern_all(window_size: usize, cutoff: usize) -> MutationPatternConfig { + MutationPatternConfig { + id: "all".to_owned(), + name: "All private substitutions".to_owned(), + cluster: Some(cluster(window_size, cutoff)), + ..MutationPatternConfig::default() + } + } + + fn pattern_substitution( + id: &str, + name: &str, + ref_nucs: Vec, + qry: Vec, + motifs: Vec, + window_size: usize, + cutoff: usize, + ) -> MutationPatternConfig { + MutationPatternConfig { + id: id.to_owned(), + name: name.to_owned(), + events: vec![MutationPatternEvent::NucSubstitution(MutationPatternNucSubstitution { + ref_nucs, + qry, + motifs, + })], + cluster: Some(cluster(window_size, cutoff)), + ..MutationPatternConfig::default() + } + } + + #[test] + fn test_mutation_patterns_empty_input() -> Result<(), Report> { + let private_muts = make_private_muts(vec![]); + let ref_seq = ref_seq_acgt(100); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, None)?; + assert!(analysis.results.results.is_empty()); + assert!(analysis.qc_clusters.is_empty()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_type_counts() -> Result<(), Report> { + let subs = vec![ + make_sub(10, Nuc::T, Nuc::C), + make_sub(20, Nuc::T, Nuc::C), + make_sub(30, Nuc::A, Nuc::G), + make_sub(40, Nuc::C, Nuc::T), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(100); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + + let counts = event_type_counts_map(&result.event_type_counts); + + assert_eq!(Some(&2), counts.get(&(Nuc::T, Nuc::C))); + assert_eq!(Some(&1), counts.get(&(Nuc::A, Nuc::G))); + assert_eq!(Some(&1), counts.get(&(Nuc::C, Nuc::T))); + assert_eq!(4, event_type_counts_total(&result.event_type_counts)); + Ok(()) + } + + #[test] + fn test_mutation_patterns_no_clusters_without_config() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(100); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, None)?; + assert!(analysis.results.results.is_empty()); + assert!(analysis.qc_clusters.is_empty()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_clusters_with_config() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(200); + let subs: Vec = (0..10) + .map(|i| { + let pos = i * 5; + make_sub(pos, ref_seq[pos], Nuc::C) + }) + .collect(); + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 3); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(1, result.counts.clusters); + assert_eq!(10, result.counts.clustered); + let cluster = &result.clusters[0]; + assert_eq!(0, cluster.start); + assert_eq!(45, cluster.end); + assert_eq!(10, cluster.count); + assert_eq!(10, cluster.events.len()); + for event in &cluster.events { + let pos = event_match_position(event); + assert!(pos >= cluster.start); + assert!(pos <= cluster.end); + } + Ok(()) + } + + #[test] + fn test_mutation_patterns_two_separate_clusters() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(2000); + let mut subs = Vec::new(); + for i in 0..8 { + let pos = i * 5; + subs.push(make_sub(pos, ref_seq[pos], Nuc::C)); + } + for i in 0..8 { + let pos = 1000 + i * 5; + subs.push(make_sub(pos, ref_seq[pos], Nuc::G)); + } + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(2, result.counts.clusters); + assert!(result.clusters[0].end < result.clusters[1].start); + Ok(()) + } + + #[test] + fn test_mutation_patterns_cutoff_boundary() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(200); + let subs: Vec = (0..5) + .map(|i| { + let pos = i * 5; + make_sub(pos, ref_seq[pos], Nuc::C) + }) + .collect(); + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let qc = qc_cluster(100, 3); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + assert_eq!(0, analysis.results.results[0].counts.clusters); + Ok(()) + } + + #[test] + fn test_mutation_patterns_cutoff_plus_one() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(200); + let subs: Vec = (0..6) + .map(|i| { + let pos = i * 5; + make_sub(pos, ref_seq[pos], Nuc::C) + }) + .collect(); + let private_muts = make_private_muts(subs); + let config = wrap(vec![pattern_all(100, 5)]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + assert_eq!(1, analysis.results.results[0].counts.clusters); + assert_eq!(6, analysis.results.results[0].counts.clustered); + Ok(()) + } + + #[test] + fn test_mutation_patterns_type_filter() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::T, Nuc::C), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::T, Nuc::C), + make_sub(20, Nuc::T, Nuc::C), + make_sub(25, Nuc::A, Nuc::G), + make_sub(30, Nuc::T, Nuc::C), + make_sub(35, Nuc::T, Nuc::C), + make_sub(40, Nuc::T, Nuc::C), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 5, + )]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(1, result.counts.clusters); + for cluster in &result.clusters { + assert_cluster_events_are_nuc_substitutions(cluster, Nuc::T, Nuc::C); + } + assert_eq!(6, event_type_counts_total(&result.event_type_counts)); + assert_eq!(6, result.counts.matches); + Ok(()) + } + + #[test] + fn test_mutation_patterns_filter_separates_qc_from_filtered() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::A, Nuc::G), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::A, Nuc::G), + make_sub(20, Nuc::A, Nuc::G), + make_sub(25, Nuc::A, Nuc::G), + make_sub(30, Nuc::A, Nuc::G), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 5, + )]); + let qc = qc_cluster(100, 5); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + assert_eq!(0, analysis.results.results[0].counts.clusters); + assert_eq!(1, analysis.qc_clusters.len()); + assert_eq!(6, analysis.qc_clusters[0].count); + Ok(()) + } + + #[test] + fn test_mutation_patterns_motif_filter_observable() -> Result<(), Report> { + // 0 1 2 3 + // 01234567890123456789012345678901234 + let ref_seq = to_nuc_seq("ACGTATCATAACGTAACGTAACGTAACGTAACGTA")?; + // ^^^ *** ^^^ ^^^ ^^^ ^^^ ^^^ + // M1 X1 M2 M3 M4 M5 M6 + // ^ M1-M6: [ACGT]CG motif matches ACG contexts + // * X1: [ACGT]CG motif rejects TCA context + let subs = vec![ + make_sub(1, Nuc::C, Nuc::T), + make_sub(6, Nuc::C, Nuc::T), + make_sub(11, Nuc::C, Nuc::T), + make_sub(16, Nuc::C, Nuc::T), + make_sub(21, Nuc::C, Nuc::T), + make_sub(26, Nuc::C, Nuc::T), + make_sub(31, Nuc::C, Nuc::T), + ]; + let private_muts = make_private_muts(subs); + let config = json_parse::( + r#"{ + "patterns": [ + { + "id": "apobec", + "name": "APOBEC-like", + "events": [ + { + "type": "nucSubstitution", + "ref": ["C"], + "qry": ["T"], + "motifs": ["[ACGT]CG"] + } + ], + "cluster": { + "windowSize": 100, + "cutoff": 3 + } + } + ] + }"#, + )?; + let qc = qc_cluster(100, 3); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&qc))?; + let result = &analysis.results.results[0]; + assert_eq!(1, result.counts.clusters); + assert_eq!(6, result.matches.len()); + assert_eq!(6, result.clusters[0].count); + assert!(result.matches.iter().any(|m| match m { + MutationPatternEventMatch::NucSubstitution(m) => m.motif_matches.iter().any(|m| m.motif == "[ACGT]CG"), + })); + assert_eq!(1, analysis.qc_clusters.len()); + assert_eq!(7, analysis.qc_clusters[0].count); + Ok(()) + } + + #[test] + fn test_mutation_patterns_output_shape_is_event_oriented() -> Result<(), Report> { + let ref_seq = ref_seq_acgt(20); + let private_muts = make_private_muts(vec![make_sub(3, Nuc::T, Nuc::C), make_sub(7, Nuc::T, Nuc::C)]); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 1, + )]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + let actual = json_stringify(&analysis.results, JsonPretty(true))?; + let actual = json_parse::(&actual)?; + + let result = &actual["results"][0]; + assert!(result.get("eventTypeCounts").is_some()); + assert!(result.get("counts").is_some()); + assert!(result.get("substitutionTypeCounts").is_none()); + assert!(result.get("totalClusters").is_none()); + assert!(result.get("totalClusteredSnps").is_none()); + + let cluster = &result["clusters"][0]; + assert!(cluster.get("count").is_some()); + assert!(cluster.get("events").is_some()); + assert!(cluster.get("eventTypeCounts").is_some()); + assert!(cluster.get("numberOfSnps").is_none()); + assert!(cluster.get("substitutions").is_none()); + assert!(cluster.get("substitutionTypeCounts").is_none()); + + Ok(()) + } + + #[test] + fn test_mutation_patterns_motif_accepts_non_trinucleotide() -> Result<(), Report> { + let result = json_parse::( + r#"{ + "type": "nucSubstitution", + "ref": ["C"], + "qry": ["T"], + "motifs": ["TC"] + }"#, + )?; + let MutationPatternEvent::NucSubstitution(result) = result; + assert_eq!("TC", result.motifs[0]); + Ok(()) + } + + #[test] + fn test_mutation_patterns_filter_matches_iupac() -> Result<(), Report> { + let filter = MutationPatternEvent::NucSubstitution(MutationPatternNucSubstitution { + ref_nucs: vec![Nuc::R], + qry: vec![Nuc::N], + motifs: vec![], + }); + let compiled = compile_events(&[filter])?; + let ref_seq = to_nuc_seq("AAA")?; + let sub_a = NucSubWithContext::from_sub(&make_sub(1, Nuc::A, Nuc::G), &ref_seq); + let sub_c = NucSubWithContext::from_sub(&make_sub(1, Nuc::C, Nuc::G), &ref_seq); + assert!(match_event(&sub_a, &compiled, "AAA").is_some()); + assert!(match_event(&sub_c, &compiled, "ACA").is_none()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_qc_config_fallback() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let legacy = QcRulesConfigSnpClusters { + enabled: true, + score_weight: ordered_float::OrderedFloat(50.0), + window_size: 100, + cluster_cut_off: 5, + }; + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, Some(&legacy))?; + assert_eq!(1, analysis.results.results.len()); + assert_eq!(1, analysis.results.results[0].counts.clusters); + assert_eq!(10, analysis.results.results[0].counts.clustered); + assert_eq!(1, analysis.qc_clusters.len()); + assert_eq!(10, analysis.qc_clusters[0].count); + Ok(()) + } + + #[test] + fn test_mutation_patterns_qc_config_disabled() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let legacy = QcRulesConfigSnpClusters { + enabled: false, + score_weight: ordered_float::OrderedFloat(50.0), + window_size: 100, + cluster_cut_off: 5, + }; + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, None, Some(&legacy))?; + assert!(analysis.results.results.is_empty()); + assert!(analysis.qc_clusters.is_empty()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_type_counts_use_matches() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::T, Nuc::C), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::T, Nuc::C), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_substitution( + "tc", + "T>C", + vec![Nuc::T], + vec![Nuc::C], + vec![], + 100, + 1, + )]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + assert_eq!( + 2, + event_type_counts_total(&analysis.results.results[0].event_type_counts) + ); + assert_eq!(2, analysis.results.results[0].counts.matches); + Ok(()) + } + + #[test] + fn test_mutation_patterns_pattern_cluster_does_not_override_qc_config() -> Result<(), Report> { + let subs: Vec = (0..10).map(|i| make_sub(i * 5, Nuc::T, Nuc::C)).collect(); + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![pattern_all(10, 5)]); + let legacy = QcRulesConfigSnpClusters { + enabled: true, + score_weight: ordered_float::OrderedFloat(50.0), + window_size: 100, + cluster_cut_off: 5, + }; + let with_new = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), Some(&legacy))?; + let without_new = analyze_mutation_patterns(&private_muts, &ref_seq, None, Some(&legacy))?; + assert_eq!(without_new.qc_clusters.len(), with_new.qc_clusters.len()); + assert_eq!(without_new.qc_clusters[0].start, with_new.qc_clusters[0].start); + assert_eq!(without_new.qc_clusters[0].end, with_new.qc_clusters[0].end); + assert_eq!(without_new.qc_clusters[0].count, with_new.qc_clusters[0].count); + assert_eq!(0, with_new.results.results[0].counts.clusters); + assert_eq!(1, with_new.qc_clusters.len()); + Ok(()) + } + + #[test] + fn test_mutation_patterns_multiple_configs() -> Result<(), Report> { + let subs = vec![ + make_sub(5, Nuc::T, Nuc::C), + make_sub(10, Nuc::A, Nuc::G), + make_sub(15, Nuc::T, Nuc::C), + make_sub(20, Nuc::T, Nuc::C), + make_sub(25, Nuc::A, Nuc::G), + make_sub(30, Nuc::T, Nuc::C), + make_sub(35, Nuc::T, Nuc::C), + make_sub(40, Nuc::T, Nuc::C), + ]; + let private_muts = make_private_muts(subs); + let ref_seq = ref_seq_acgt(200); + let config = wrap(vec![ + MutationPatternConfig { + description: Some("T>C editing".to_owned()), + ..pattern_substitution("tc", "T>C", vec![Nuc::T], vec![Nuc::C], vec![], 100, 5) + }, + MutationPatternConfig { + description: Some("A>G editing".to_owned()), + ..pattern_substitution("ag", "A>G", vec![Nuc::A], vec![Nuc::G], vec![], 100, 1) + }, + ]); + let analysis = analyze_mutation_patterns(&private_muts, &ref_seq, Some(&config), None)?; + let results = &analysis.results.results; + assert_eq!(2, results.len()); + assert_eq!(1, results[0].counts.clusters); + assert_eq!(Some("T>C editing"), results[0].description.as_deref()); + assert_eq!(1, results[1].counts.clusters); + assert_eq!(Some("A>G editing"), results[1].description.as_deref()); + for cluster in &results[0].clusters { + assert_cluster_events_are_nuc_substitutions(cluster, Nuc::T, Nuc::C); + } + for cluster in &results[1].clusters { + assert_cluster_events_are_nuc_substitutions(cluster, Nuc::A, Nuc::G); + } + Ok(()) + } +} diff --git a/packages/nextclade/src/analyze/nuc_sub_context.rs b/packages/nextclade/src/analyze/nuc_sub_context.rs new file mode 100644 index 000000000..ba0b89330 --- /dev/null +++ b/packages/nextclade/src/analyze/nuc_sub_context.rs @@ -0,0 +1,75 @@ +use crate::alphabet::nuc::Nuc; +use crate::analyze::nuc_sub::NucSub; +use crate::coord::position::PositionLike; +use serde::{Deserialize, Serialize}; + +/// Nucleotide substitution with local reference nucleotide context centered on the substituted position. +/// +/// Current output contains `[upstream, current, downstream]`. `current` is the substituted reference nucleotide. +/// Boundary positions use gap characters for absent flanking nucleotides. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Hash)] +#[serde(rename_all = "camelCase")] +pub struct NucSubWithContext { + /// Nucleotide substitution. + #[serde(flatten)] + pub sub: NucSub, + + /// Reference nucleotide context around the substituted position. + pub ref_context: Vec, +} + +impl NucSubWithContext { + pub fn from_sub(sub: &NucSub, ref_seq: &[Nuc]) -> Self { + let pos = sub.pos.as_usize(); + let upstream = if pos > 0 { ref_seq[pos - 1] } else { Nuc::Gap }; + let current = ref_seq[pos]; + let downstream = if pos + 1 < ref_seq.len() { + ref_seq[pos + 1] + } else { + Nuc::Gap + }; + Self { + sub: sub.clone(), + ref_context: vec![upstream, current, downstream], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::alphabet::nuc::to_nuc_seq; + use crate::coord::position::NucRefGlobalPosition; + use eyre::Report; + use pretty_assertions::assert_eq; + use rstest::rstest; + + fn make_sub(pos: usize, ref_nuc: Nuc, qry_nuc: Nuc) -> NucSub { + NucSub { + pos: NucRefGlobalPosition::from(pos), + ref_nuc, + qry_nuc, + } + } + + #[rustfmt::skip] + #[rstest] + #[case::interior( 5, "ACGTACGT", "ACG")] + #[case::first_pos( 0, "TAC", "-TA")] + #[case::last_pos( 2, "ACG", "CG-")] + #[case::single_base(0, "T", "-T-")] + #[trace] + fn test_nuc_sub_context_from_sub( + #[case] pos: usize, + #[case] ref_seq: &str, + #[case] expected_context: &str, + ) -> Result<(), Report> { + let ref_seq = to_nuc_seq(ref_seq)?; + let expected_context = to_nuc_seq(expected_context)?; + let sub = make_sub(pos, ref_seq[pos], Nuc::A); + let result = NucSubWithContext::from_sub(&sub, &ref_seq); + assert_eq!(expected_context, result.ref_context); + assert_eq!(ref_seq[pos], result.ref_context[1]); + Ok(()) + } +} diff --git a/packages/nextclade/src/analyze/virus_properties.rs b/packages/nextclade/src/analyze/virus_properties.rs index 59d447f7a..a9f6c7866 100644 --- a/packages/nextclade/src/analyze/virus_properties.rs +++ b/packages/nextclade/src/analyze/virus_properties.rs @@ -19,9 +19,9 @@ use crate::{o, vec_of_owned}; use eyre::{Report, WrapErr}; use maplit::btreemap; use ordered_float::OrderedFloat; -use schemars; use semver::Version; use serde::{Deserialize, Serialize}; +use smart_default::SmartDefault; use std::collections::BTreeMap; use std::path::Path; use validator::Validate; @@ -55,6 +55,169 @@ impl PathogenAttributes { } } +/// Mutation event filter in mutation pattern analysis. +/// +/// Each event selects one class of private mutations to include in a pattern. More event variants can be added without +/// changing the surrounding `mutationPatterns.patterns[]` structure. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(tag = "type", rename_all = "camelCase")] +#[schemars(example = "MutationPatternEvent::example")] +pub enum MutationPatternEvent { + /// Nucleotide substitution event, selected by reference nucleotide, query nucleotide, and optional reference motifs. + NucSubstitution(MutationPatternNucSubstitution), +} + +impl MutationPatternEvent { + pub fn example() -> Self { + Self::NucSubstitution(MutationPatternNucSubstitution::example()) + } + + pub const fn matches_all_contexts(&self) -> bool { + match self { + Self::NucSubstitution(event) => event.motifs.is_empty(), + } + } +} + +/// Filter for selecting nucleotide substitutions in mutation pattern analysis. +/// +/// A substitution matches this event when its reference nucleotide is in `ref`, its query nucleotide is in `qry`, and at +/// least one motif matches the surrounding reference sequence. If `motifs` is empty, the event matches without a sequence +/// context restriction. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternNucSubstitution::example")] +pub struct MutationPatternNucSubstitution { + /// Reference nucleotides to match at the mutated position. + #[serde(rename = "ref")] + pub ref_nucs: Vec, + + /// Query nucleotides to match at the mutated position. + pub qry: Vec, + + /// Regular expressions matched against the reference sequence. A mutation matches a motif when the regex match spans + /// the mutated position. Use IUPAC nucleotide letters directly in the regex, for example `TC[AT]` for a TCW motif. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub motifs: Vec, +} + +impl MutationPatternNucSubstitution { + pub fn example() -> Self { + Self { + ref_nucs: vec![Nuc::A, Nuc::T], + qry: vec![Nuc::G, Nuc::C], + motifs: vec_of_owned!["A[ACGT]G", "T[ACGT]C"], + } + } +} + +/// Clustering rule applied to events matching one mutation pattern. +/// +/// Clustering is pattern-local. It decides which matched events are reported as dense clusters in this pattern. It does +/// not change the global `qc.snpClusters` rule unless the QC rule itself is configured separately. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, SmartDefault, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[serde(default)] +#[schemars(example = "MutationPatternClusterConfig::example")] +pub struct MutationPatternClusterConfig { + /// Sliding nucleotide window size. A cluster is detected when `cutoff` matched events fall within this many reference + /// nucleotides. + #[default = 100] + pub window_size: usize, + + /// Minimum number of matched events in one sliding window required to report a cluster. + #[default = 5] + pub cutoff: usize, +} + +impl MutationPatternClusterConfig { + pub const fn example() -> Self { + Self { + window_size: 100, + cutoff: 5, + } + } +} + +/// Named mutation pattern: event filters, optional clustering, and display metadata. +/// +/// Dataset authors can define multiple patterns to separate biologically different mutation processes, such as +/// ADAR-like A-to-I editing and APOBEC-like cytosine deamination. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternConfig::example")] +pub struct MutationPatternConfig { + /// Stable machine-readable identifier. This value appears in JSON and TSV output. + pub id: String, + + /// Human-readable name shown in Nextclade Web tooltips and reports. + pub name: String, + + /// Optional explanatory text shown together with the pattern result. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, + + /// Event filters included in this pattern. If empty, the pattern matches all private nucleotide substitutions. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub events: Vec, + + /// Optional pattern-local clustering rule. If omitted, Nextclade reports matches and type counts, but no clusters for + /// this pattern. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cluster: Option, +} + +impl MutationPatternConfig { + pub fn example() -> Self { + Self { + id: o!("adar"), + name: o!("ADAR-like RNA editing"), + description: Some(o!("ADAR-mediated A-to-I editing observed as A>G and complementary T>C")), + events: vec![MutationPatternEvent::NucSubstitution( + MutationPatternNucSubstitution::example(), + )], + cluster: Some(MutationPatternClusterConfig::example()), + } + } +} + +/// Configuration for mutation pattern analysis. Detects private mutations matching biologically meaningful mutation +/// type and reference-context rules. Pattern-specific clustering is independent from global `qc.snpClusters`. +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +#[schemars(example = "MutationPatternsConfig::example")] +pub struct MutationPatternsConfig { + /// Mutation patterns evaluated independently for every analyzed sequence. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub patterns: Vec, +} + +impl MutationPatternsConfig { + pub fn example() -> Self { + Self { + patterns: vec![ + MutationPatternConfig::example(), + MutationPatternConfig { + id: o!("apobec"), + name: o!("APOBEC-like cytosine deamination"), + description: Some(o!( + "APOBEC-like cytosine deamination observed as C>T and complementary G>A" + )), + events: vec![MutationPatternEvent::NucSubstitution(MutationPatternNucSubstitution { + ref_nucs: vec![Nuc::C, Nuc::G], + qry: vec![Nuc::T, Nuc::A], + motifs: vec_of_owned!["TC[AT]", "[AT]GA"], + })], + cluster: Some(MutationPatternClusterConfig { + window_size: 50, + cutoff: 4, + }), + }, + ], + } + } +} + /// pathogen.json dataset file. Contains external configuration and data specific for a particular pathogen. #[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] @@ -95,6 +258,11 @@ pub struct VirusProperties { #[serde(default, skip_serializing_if = "Option::is_none")] pub qc: Option, + /// Mutation pattern analysis configuration. When present, detects private mutation patterns such as enzyme-associated + /// clustered substitution signatures. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub mutation_patterns: Option, + /// General analysis parameters (e.g. includeReference, inOrder, replaceUnknown). #[serde(default, skip_serializing_if = "Option::is_none")] pub general_params: Option, @@ -416,6 +584,7 @@ impl VirusProperties { cds_order_preference: vec_of_owned!["HA1", "HA2"], mut_labels: LabelledMutationsConfig::example(), qc: Some(QcConfig::example()), + mutation_patterns: Some(MutationPatternsConfig::example()), general_params: None, alignment_params: None, tree_builder_params: None, From f7764f1fcbc00e7670fc9205fd5a847267593aea Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 26 May 2026 12:00:41 +0200 Subject: [PATCH 2/9] refactor(qc): decouple SNP cluster rule from cluster computation - QC scoring uses pre-computed unfiltered clusters, independent of pattern-specific type filtering - window_size and cluster_cut_off on QcRulesConfigSnpClusters become serde-defaulted for backward compatibility with datasets omitting them --- packages/nextclade/src/qc/qc_config.rs | 14 +- .../nextclade/src/qc/qc_rule_snp_clusters.rs | 121 ++++++++++-------- packages/nextclade/src/qc/qc_run.rs | 4 +- .../nextclade/src/run/nextclade_run_one.rs | 1 + 4 files changed, 82 insertions(+), 58 deletions(-) diff --git a/packages/nextclade/src/qc/qc_config.rs b/packages/nextclade/src/qc/qc_config.rs index 6a4efe6be..a2f329355 100644 --- a/packages/nextclade/src/qc/qc_config.rs +++ b/packages/nextclade/src/qc/qc_config.rs @@ -118,21 +118,25 @@ const fn one() -> OrderedFloat { #[schemars(example = "QcRulesConfigSnpClusters::example")] pub struct QcRulesConfigSnpClusters { pub enabled: bool, - /// Size of the sliding window (in nucleotides) used to count private mutations - pub window_size: usize, - /// Number of private mutations within a window above which the window is flagged as a cluster - pub cluster_cut_off: usize, /// QC score added per detected cluster pub score_weight: OrderedFloat, + + /// Size of the sliding nucleotide window for global SNP cluster detection + #[serde(default)] + pub window_size: usize, + + /// Number of substitutions within a window required to count as a global SNP cluster + #[serde(default)] + pub cluster_cut_off: usize, } impl QcRulesConfigSnpClusters { pub const fn example() -> Self { Self { enabled: true, + score_weight: OrderedFloat(50.0), window_size: 100, cluster_cut_off: 5, - score_weight: OrderedFloat(50.0), } } } diff --git a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs index 9c0a66d24..f75c308a0 100644 --- a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs +++ b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs @@ -1,12 +1,8 @@ -use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; -use crate::analyze::nuc_sub::NucSub; -use crate::coord::position::PositionLike; +use crate::analyze::mutation_patterns::MutationPatternCluster; use crate::qc::qc_config::QcRulesConfigSnpClusters; use crate::qc::qc_run::{QcRule, QcStatus}; -use itertools::Itertools; use num::traits::clamp_min; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; /// A cluster of private nucleotide substitutions within a genomic window. /// @@ -51,22 +47,24 @@ impl QcRule for QcResultSnpClusters { } pub fn rule_snp_clusters( - private_nuc_mutations: &PrivateNucMutations, + qc_clusters: &[MutationPatternCluster], config: &QcRulesConfigSnpClusters, ) -> Option { if !config.enabled { return None; } - let mut snp_clusters = find_snp_clusters(&private_nuc_mutations.private_substitutions, config); - for cluster in &mut snp_clusters { - cluster.sort_unstable(); - } - - let total_clusters = snp_clusters.len(); + let clustered_snps: Vec = qc_clusters + .iter() + .map(|c| ClusteredSnp { + start: c.start, + end: c.end, + number_of_snps: c.count, + }) + .collect(); - let clustered_snps = process_snp_clusters(snp_clusters); - let total_snps = clustered_snps.iter().map(|cluster| cluster.number_of_snps).sum(); + let total_clusters = clustered_snps.len(); + let total_snps = clustered_snps.iter().map(|c| c.number_of_snps).sum(); let score = clamp_min(total_clusters as f64 * *config.score_weight, 0.0); let status = QcStatus::from_score(score); @@ -79,49 +77,68 @@ pub fn rule_snp_clusters( }) } -fn find_snp_clusters(private_nuc_mutations: &[NucSub], config: &QcRulesConfigSnpClusters) -> Vec> { - let mut current_cluster = VecDeque::::new(); - let mut all_clusters = Vec::>::new(); - let mut previous_pos: isize = -1; - for mutation in private_nuc_mutations { - let pos = mutation.pos.as_isize(); - current_cluster.push_back(pos); - - while current_cluster[0] < (pos - config.window_size as isize) { - current_cluster.pop_front(); +#[cfg(test)] +#[allow( + clippy::float_cmp, + reason = "SNP cluster QC scores are exact products of integer cluster counts and fixed score weights" +)] +mod tests { + use super::*; + use ordered_float::OrderedFloat; + use pretty_assertions::assert_eq; + + fn make_config(enabled: bool, score_weight: f64) -> QcRulesConfigSnpClusters { + QcRulesConfigSnpClusters { + enabled, + score_weight: OrderedFloat(score_weight), + window_size: 0, + cluster_cut_off: 0, } + } - if current_cluster.len() > config.cluster_cut_off { - let n_clusters = all_clusters.len(); - - if !all_clusters.is_empty() && current_cluster.len() > 1 { - let i = n_clusters - 1; - let j = all_clusters[i].len() - 1; - let p = all_clusters[i][j]; - - if p == previous_pos { - all_clusters[i].push(pos); - } else { - all_clusters.push(current_cluster.iter().copied().collect_vec()); - } - } else { - all_clusters.push(current_cluster.iter().copied().collect_vec()); - } + fn make_cluster(start: usize, end: usize, n: usize) -> MutationPatternCluster { + MutationPatternCluster { + start, + end, + count: n, + ..MutationPatternCluster::default() } - previous_pos = pos; } - all_clusters -} + #[test] + fn test_rule_snp_clusters_disabled() { + let result = rule_snp_clusters(&[], &make_config(false, 50.0)); + assert!(result.is_none()); + } + + #[test] + fn test_rule_snp_clusters_empty() { + let result = rule_snp_clusters(&[], &make_config(true, 50.0)).expect("should be Some"); + assert_eq!(0.0, result.score); + assert!(matches!(result.status, QcStatus::Good)); + assert_eq!(0, result.total_snps); + assert!(result.clustered_snps.is_empty()); + } + + #[test] + fn test_rule_snp_clusters_one_cluster() { + let clusters = vec![make_cluster(100, 200, 6)]; + let result = rule_snp_clusters(&clusters, &make_config(true, 50.0)).expect("should be Some"); + assert_eq!(50.0, result.score); + assert!(matches!(result.status, QcStatus::Mediocre)); + assert_eq!(6, result.total_snps); + assert_eq!(1, result.clustered_snps.len()); + assert_eq!(100, result.clustered_snps[0].start); + assert_eq!(200, result.clustered_snps[0].end); + assert_eq!(6, result.clustered_snps[0].number_of_snps); + } -fn process_snp_clusters(snp_clusters: Vec>) -> Vec { - let mut result = Vec::with_capacity(snp_clusters.len()); - for cluster in snp_clusters { - result.push(ClusteredSnp { - start: cluster[0] as usize, - end: cluster[cluster.len() - 1] as usize, - number_of_snps: cluster.len(), - }); + #[test] + fn test_rule_snp_clusters_two_clusters_bad() { + let clusters = vec![make_cluster(100, 200, 6), make_cluster(500, 600, 8)]; + let result = rule_snp_clusters(&clusters, &make_config(true, 50.0)).expect("should be Some"); + assert_eq!(100.0, result.score); + assert!(matches!(result.status, QcStatus::Bad)); + assert_eq!(14, result.total_snps); } - result } diff --git a/packages/nextclade/src/qc/qc_run.rs b/packages/nextclade/src/qc/qc_run.rs index e74640779..c47722fdf 100644 --- a/packages/nextclade/src/qc/qc_run.rs +++ b/packages/nextclade/src/qc/qc_run.rs @@ -1,5 +1,6 @@ use crate::alphabet::nuc::Nuc; use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; +use crate::analyze::mutation_patterns::MutationPatternCluster; use crate::qc::qc_config::QcConfig; use crate::qc::qc_rule_frame_shifts::{QcResultFrameShifts, rule_frame_shifts}; use crate::qc::qc_rule_missing_data::{QcResultMissingData, rule_missing_data}; @@ -82,6 +83,7 @@ pub trait QcRule { pub fn qc_run( private_nuc_mutations: &PrivateNucMutations, + qc_clusters: &[MutationPatternCluster], nucleotide_composition: &BTreeMap, total_missing: usize, translation: &Translation, @@ -92,7 +94,7 @@ pub fn qc_run( missing_data: rule_missing_data(total_missing, &config.missing_data), mixed_sites: rule_mixed_sites(nucleotide_composition, &config.mixed_sites), private_mutations: rule_private_mutations(private_nuc_mutations, &config.private_mutations), - snp_clusters: rule_snp_clusters(private_nuc_mutations, &config.snp_clusters), + snp_clusters: rule_snp_clusters(qc_clusters, &config.snp_clusters), frame_shifts: rule_frame_shifts(frame_shifts, &config.frame_shifts), stop_codons: rule_stop_codons(translation, &config.stop_codons), overall_score: 0.0, diff --git a/packages/nextclade/src/run/nextclade_run_one.rs b/packages/nextclade/src/run/nextclade_run_one.rs index cdc647fc4..cd9cce185 100644 --- a/packages/nextclade/src/run/nextclade_run_one.rs +++ b/packages/nextclade/src/run/nextclade_run_one.rs @@ -434,6 +434,7 @@ pub fn nextclade_run_one( .map(|qc_config| { qc_run( &private_nuc_mutations, + &[], &nucleotide_composition, total_missing, &translation, From 9de030d7b5e9dd7b5bdfa37e41f3ae458c76fd13 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 26 May 2026 12:00:53 +0200 Subject: [PATCH 3/9] feat(nextclade): integrate mutation patterns into pipeline and output - Wire analyze_mutation_patterns() between private mutation calling and QC scoring - Add mutationPatterns to JSON/NDJSON output and 9 CSV/TSV columns with pattern-delimited multi-pattern support - Fix inclusive-to-half-open range conversion in cluster range formatters --- .../src/io/nextclade_csv_column_config.rs | 12 ++ .../nextclade/src/io/nextclade_csv_row.rs | 114 +++++++++++++++++- .../nextclade/src/run/nextclade_run_one.rs | 12 +- packages/nextclade/src/types/outputs.rs | 3 + 4 files changed, 137 insertions(+), 4 deletions(-) diff --git a/packages/nextclade/src/io/nextclade_csv_column_config.rs b/packages/nextclade/src/io/nextclade_csv_column_config.rs index d9a4e2d33..7c1547874 100644 --- a/packages/nextclade/src/io/nextclade_csv_column_config.rs +++ b/packages/nextclade/src/io/nextclade_csv_column_config.rs @@ -26,6 +26,7 @@ pub enum CsvColumnCategory { RelMuts, ErrsWarns, Qc, + MutPatterns, Primers, Dynamic, } @@ -198,6 +199,17 @@ pub static CSV_COLUMN_CONFIG_MAP_DEFAULT: LazyLock = LazyLoc o!("qc.stopCodons.score") => true, o!("qc.stopCodons.status") => true, }, + CsvColumnCategory::MutPatterns => indexmap! { + o!("mutationPatterns.id") => true, + o!("mutationPatterns.name") => true, + o!("mutationPatterns.description") => true, + o!("mutationPatterns.counts.matches") => true, + o!("mutationPatterns.counts.clustered") => true, + o!("mutationPatterns.counts.clusters") => true, + o!("mutationPatterns.eventTypeCounts") => true, + o!("mutationPatterns.clusters") => true, + o!("mutationPatterns.clusterEvents") => true, + }, CsvColumnCategory::Primers => indexmap! { o!("totalPcrPrimerChanges") => true, o!("pcrPrimerChanges") => true, diff --git a/packages/nextclade/src/io/nextclade_csv_row.rs b/packages/nextclade/src/io/nextclade_csv_row.rs index 9dc0b25e7..2c60d8d72 100644 --- a/packages/nextclade/src/io/nextclade_csv_row.rs +++ b/packages/nextclade/src/io/nextclade_csv_row.rs @@ -6,6 +6,9 @@ use crate::analyze::aa_sub::{AaSub, AaSubLabeled}; use crate::analyze::find_aa_motifs::AaMotif; use crate::analyze::find_clade_founder::CladeNodeAttrFounderInfo; use crate::analyze::letter_ranges::{CdsAaRange, NucRange}; +use crate::analyze::mutation_patterns::{ + MutationPatternCluster, MutationPatternEventMatch, MutationPatternEventTypeCount, MutationPatternResults, +}; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::{NucSub, NucSubLabeled}; use crate::analyze::pcr_primer_changes::PcrPrimerChange; @@ -66,6 +69,7 @@ impl NextcladeResultsCsvRow { clade, private_nuc_mutations, private_aa_mutations, + mutation_patterns, missing_cdses, // divergence, coverage, @@ -429,6 +433,50 @@ impl NextcladeResultsCsvRow { "qc.stopCodons.status", qc.stop_codons.as_ref().map(|sc| sc.status.to_string()), )?; + self.add_entry( + "mutationPatterns.id", + &format_mutation_patterns_field(&mutation_patterns.results, |p| p.id.clone()), + )?; + self.add_entry( + "mutationPatterns.name", + &format_mutation_patterns_field(&mutation_patterns.results, |p| p.name.clone()), + )?; + self.add_entry( + "mutationPatterns.description", + &format_mutation_patterns_field(&mutation_patterns.results, |p| { + p.description.as_deref().unwrap_or("").to_owned() + }), + )?; + self.add_entry( + "mutationPatterns.counts.matches", + &format_mutation_patterns_field(&mutation_patterns.results, |p| p.counts.matches.to_string()), + )?; + self.add_entry( + "mutationPatterns.counts.clustered", + &format_mutation_patterns_field(&mutation_patterns.results, |p| p.counts.clustered.to_string()), + )?; + self.add_entry( + "mutationPatterns.counts.clusters", + &format_mutation_patterns_field(&mutation_patterns.results, |p| p.counts.clusters.to_string()), + )?; + self.add_entry( + "mutationPatterns.eventTypeCounts", + &format_mutation_patterns_field(&mutation_patterns.results, |p| { + format_mutation_pattern_event_type_counts(&p.event_type_counts, ARRAY_ITEM_DELIMITER) + }), + )?; + self.add_entry( + "mutationPatterns.clusters", + &format_mutation_patterns_field(&mutation_patterns.results, |p| { + format_mutation_clusters_ranges(&p.clusters, ARRAY_ITEM_DELIMITER) + }), + )?; + self.add_entry( + "mutationPatterns.clusterEvents", + &format_mutation_patterns_field(&mutation_patterns.results, |p| { + format_mutation_clusters_events(&p.clusters, ARRAY_ITEM_DELIMITER) + }), + )?; self.add_entry("isReverseComplement", &is_reverse_complement.to_string())?; self.add_entry("failedCdses", &format_failed_cdses(missing_cdses, ARRAY_ITEM_DELIMITER))?; self.add_entry( @@ -671,13 +719,73 @@ pub fn format_clustered_snps(snps: &[ClusteredSnp], delimiter: &str) -> String { snps .iter() .map(|snp| { - let range = NucRefGlobalRange::from_usize(snp.start, snp.end).to_string(); - let number_of_snps = snp.number_of_snps; - format!("{range}:{number_of_snps}") + let range = NucRefGlobalRange::from_usize(snp.start, snp.end + 1).to_string(); + let count = snp.number_of_snps; + format!("{range}:{count}") + }) + .join(delimiter) +} + +const PATTERN_DELIMITER: &str = "|"; + +#[inline] +fn format_mutation_patterns_field( + patterns: &[MutationPatternResults], + fmt: impl Fn(&MutationPatternResults) -> String, +) -> String { + if patterns.len() <= 1 { + return patterns.first().map_or_else(String::new, &fmt); + } + patterns.iter().map(&fmt).join(PATTERN_DELIMITER) +} + +#[inline] +pub fn format_mutation_pattern_event_type_counts(counts: &[MutationPatternEventTypeCount], delimiter: &str) -> String { + counts + .iter() + .map(|c| match c { + MutationPatternEventTypeCount::NucSubstitution(c) => { + format!( + "nucSubstitution:{}>{}:{}", + from_nuc(c.ref_nuc), + from_nuc(c.qry_nuc), + c.count + ) + } }) .join(delimiter) } +#[inline] +pub fn format_mutation_clusters_ranges(clusters: &[MutationPatternCluster], delimiter: &str) -> String { + clusters + .iter() + .map(|c| { + let range = NucRefGlobalRange::from_usize(c.start, c.end + 1).to_string(); + format!("{range}:{}", c.count) + }) + .join(delimiter) +} + +#[inline] +pub fn format_mutation_clusters_events(clusters: &[MutationPatternCluster], delimiter: &str) -> String { + clusters + .iter() + .map(|c| { + let range = NucRefGlobalRange::from_usize(c.start, c.end + 1).to_string(); + let events = c.events.iter().map(format_mutation_pattern_event).join(";"); + format!("{range}:{events}") + }) + .join(delimiter) +} + +#[inline] +fn format_mutation_pattern_event(event: &MutationPatternEventMatch) -> String { + match event { + MutationPatternEventMatch::NucSubstitution(event) => format!("nucSubstitution:{}", event.substitution.sub), + } +} + #[inline] pub fn format_stop_codons(stop_codons: &[StopCodonLocation], delimiter: &str) -> String { stop_codons diff --git a/packages/nextclade/src/run/nextclade_run_one.rs b/packages/nextclade/src/run/nextclade_run_one.rs index cd9cce185..a90aa967b 100644 --- a/packages/nextclade/src/run/nextclade_run_one.rs +++ b/packages/nextclade/src/run/nextclade_run_one.rs @@ -26,6 +26,7 @@ use crate::analyze::letter_composition::get_letter_composition; use crate::analyze::letter_ranges::{ CdsAaRange, NucRange, find_aa_letter_ranges, find_letter_ranges, find_letter_ranges_by, }; +use crate::analyze::mutation_patterns::analyze_mutation_patterns; use crate::analyze::nuc_alignment::NucAlignment; use crate::analyze::nuc_changes::{FindNucChangesOutput, find_nuc_changes}; use crate::analyze::nuc_del::NucDelRange; @@ -428,13 +429,21 @@ pub fn nextclade_run_one( let aa_motifs = find_aa_motifs(&virus_properties.aa_motifs, &translation)?; let aa_motifs_changes = find_aa_motifs_changes(aa_motifs_ref, &aa_motifs, ref_translation, &translation)?; + let mutation_pattern_analysis = analyze_mutation_patterns( + &private_nuc_mutations, + ref_seq, + virus_properties.mutation_patterns.as_ref(), + virus_properties.qc.as_ref().map(|qc| &qc.snp_clusters), + )?; + let mutation_patterns = mutation_pattern_analysis.results; + let qc = virus_properties .qc .as_ref() .map(|qc_config| { qc_run( &private_nuc_mutations, - &[], + &mutation_pattern_analysis.qc_clusters, &nucleotide_composition, total_missing, &translation, @@ -512,6 +521,7 @@ pub fn nextclade_run_one( clade, private_nuc_mutations, private_aa_mutations, + mutation_patterns, clade_founder_info, clade_node_attr_founder_info, ref_nodes: ref_nodes.to_owned(), diff --git a/packages/nextclade/src/types/outputs.rs b/packages/nextclade/src/types/outputs.rs index 255290228..c7791fc1c 100644 --- a/packages/nextclade/src/types/outputs.rs +++ b/packages/nextclade/src/types/outputs.rs @@ -10,6 +10,7 @@ use crate::analyze::find_private_nuc_mutations::PrivateNucMutations; use crate::analyze::find_relative_aa_mutations::RelativeAaMutations; use crate::analyze::find_relative_nuc_mutations::RelativeNucMutations; use crate::analyze::letter_ranges::{CdsAaRange, NucRange}; +use crate::analyze::mutation_patterns::MutationPatternsResults; use crate::analyze::nuc_del::NucDelRange; use crate::analyze::nuc_sub::NucSub; use crate::analyze::pcr_primer_changes::PcrPrimerChange; @@ -137,6 +138,8 @@ pub struct NextcladeOutputs { pub private_nuc_mutations: PrivateNucMutations, /// Per-CDS amino acid mutations not shared with the nearest reference tree node pub private_aa_mutations: BTreeMap, + /// Per-type mutation statistics, local reference context, and detected mutation clusters + pub mutation_patterns: MutationPatternsResults, /// Mutations relative to the clade founder node pub clade_founder_info: Option, /// Per-attribute mutations relative to founder nodes, keyed by attribute name From 85d2ff78f845e6d814e09efacd35a1b2a5ea5438 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 26 May 2026 12:01:04 +0200 Subject: [PATCH 4/9] feat(web): add mutation cluster markers and pattern tooltips - Cluster markers rendered as orange-bordered SVG rectangles with configurable height (Off/Top/Bottom/Full) - Mutation column tooltip groups events by configured pattern, showing cluster ranges and nucleotide badges --- .../components/Results/ColumnMutations.tsx | 117 +++++++++++++++- .../SequenceView/SequenceMarkerCluster.tsx | 130 ++++++++++++++++++ .../SequenceView/SequenceViewAbsolute.tsx | 18 ++- .../SequenceView/SequenceViewRelative.tsx | 19 ++- .../components/Settings/SeqViewSettings.tsx | 17 +++ .../src/helpers/formatQCSNPClusters.ts | 18 ++- .../src/state/seqViewSettings.state.ts | 6 + 7 files changed, 315 insertions(+), 10 deletions(-) create mode 100644 packages/nextclade-web/src/components/SequenceView/SequenceMarkerCluster.tsx diff --git a/packages/nextclade-web/src/components/Results/ColumnMutations.tsx b/packages/nextclade-web/src/components/Results/ColumnMutations.tsx index 561b3df83..ac16154a8 100644 --- a/packages/nextclade-web/src/components/Results/ColumnMutations.tsx +++ b/packages/nextclade-web/src/components/Results/ColumnMutations.tsx @@ -1,16 +1,129 @@ import React, { useCallback, useMemo, useState } from 'react' import { useRecoilValue } from 'recoil' +import styled from 'styled-components' import { REF_NODE_CLADE_FOUNDER, REF_NODE_PARENT, REF_NODE_ROOT } from 'src/constants' import { findCladeNodeAttrFounderInfo, getAaMutations, getNucMutations } from 'src/helpers/relativeMuts' import { viewedDatasetNameAtom } from 'src/state/dataset.state' import { currentRefNodeNameAtom, refNodesAtom } from 'src/state/results.state' import type { ColumnCladeProps } from 'src/components/Results/ColumnClade' +import type { AnalysisResult } from 'src/types' import { getSafeId } from 'src/helpers/getSafeId' -import { TableSlim } from 'src/components/Common/TableSlim' import { Tooltip } from 'src/components/Results/Tooltip' import { ListOfNucMuts } from 'src/components/Results/ListOfNucMuts' import { ListOfAaMuts } from 'src/components/Results/ListOfAaMuts' +import { NucleotideMutationBadge } from 'src/components/Common/MutationBadge' import { useTranslationSafe } from 'src/helpers/useTranslationSafe' +import { TableSlim } from 'src/components/Common/TableSlim' +import type { MutationPatternEventMatch } from 'src/gen/_SchemaRoot' + +const PatternList = styled.div` + border-top: 1px solid #dee2e6; + margin-top: 0.5rem; + padding-top: 0.5rem; +` + +const PatternSection = styled.section` + border-top: 1px solid #343a40; + margin-top: 0.7rem; + padding-top: 0.6rem; + + &:first-child { + border-top: none; + margin-top: 0; + padding-top: 0; + } +` + +const PatternName = styled.div` + font-weight: 700; +` + +const PatternDescription = styled.div` + color: ${(props) => props.theme.gray600}; + font-size: 0.85em; + margin-bottom: 0.35rem; +` + +const ClusterCard = styled.div` + background: rgba(255, 140, 0, 0.06); + border: 1px solid rgba(255, 140, 0, 0.2); + border-radius: 3px; + padding: 0.25rem 0.4rem; + + &:not(:last-child) { + margin-bottom: 0.35rem; + } +` + +const ClusterTitle = styled.div` + font-weight: 700; + font-size: 0.85em; + margin-bottom: 2px; +` + +const ClusterBadgeGrid = styled.div` + display: flex; + flex-wrap: wrap; + gap: 2px; +` + +function MutationPatternsSection({ analysisResult }: { analysisResult: AnalysisResult }) { + const { t } = useTranslationSafe() + const { mutationPatterns, privateNucMutations } = analysisResult + + if (!mutationPatterns?.results?.length || privateNucMutations.privateSubstitutions.length === 0) { + return null + } + + const visiblePatterns = mutationPatterns.results.filter((pattern) => pattern.clusters.length > 0) + + if (visiblePatterns.length === 0) { + return null + } + + return ( + + {visiblePatterns.map((pattern) => ( + + {pattern.name || t('Mutation pattern')} + {pattern.description && {pattern.description}} + + {pattern.clusters.map((cluster) => ( + + + {t('{{start}}-{{end}} ({{count}} events)', { + start: cluster.start + 1, + end: cluster.end + 1, + count: cluster.count, + })} + + + {cluster.events.map((event) => ( + + ))} + + + ))} + + ))} + + ) +} + +function MutationPatternEventBadge({ event }: { event: MutationPatternEventMatch }) { + switch (event.type) { + case 'nucSubstitution': + return + } +} + +function mutationPatternEventKey(event: MutationPatternEventMatch): string { + switch (event.type) { + case 'nucSubstitution': + return `${event.type}:${event.pos}:${event.refNuc}:${event.qryNuc}` + } + throw new Error(`Unknown mutation pattern event type: ${event.type}`) +} export function ColumnMutations({ analysisResult }: ColumnCladeProps) { const { t } = useTranslationSafe() @@ -117,6 +230,8 @@ export function ColumnMutations({ analysisResult }: ColumnCladeProps) { + + ) diff --git a/packages/nextclade-web/src/components/SequenceView/SequenceMarkerCluster.tsx b/packages/nextclade-web/src/components/SequenceView/SequenceMarkerCluster.tsx new file mode 100644 index 000000000..8a46587a3 --- /dev/null +++ b/packages/nextclade-web/src/components/SequenceView/SequenceMarkerCluster.tsx @@ -0,0 +1,130 @@ +import React, { SVGProps, useCallback, useMemo, useState } from 'react' +import { useRecoilValue } from 'recoil' +import styled from 'styled-components' + +import { useTranslationSafe as useTranslation } from 'src/helpers/useTranslationSafe' + +import { Tooltip } from 'src/components/Results/Tooltip' +import { NucleotideMutationBadge } from 'src/components/Common/MutationBadge' +import { getSafeId } from 'src/helpers/getSafeId' +import { + SeqMarkerHeightState, + getSeqMarkerDims, + seqMarkerClusterHeightStateAtom, +} from 'src/state/seqViewSettings.state' +import type { MutationPatternEventMatch } from 'src/gen/_SchemaRoot' + +const CLUSTER_FILL = 'rgba(255, 140, 0, 0.12)' +const CLUSTER_STROKE = '#e06000' +const CLUSTER_MIN_WIDTH_PX = 12 + +const ClusterBadgeGrid = styled.div` + display: flex; + flex-wrap: wrap; + gap: 2px; + margin-top: 4px; +` + +const ClusterDescription = styled.div` + margin-top: 4px; + font-size: 0.85em; + color: #666; +` + +interface ClusterProps { + start: number + end: number + count: number + events: MutationPatternEventMatch[] +} + +export interface SequenceMarkerClusterProps extends SVGProps { + index: number + seqName: string + cluster: ClusterProps + pixelsPerBase: number + description?: string +} + +function SequenceMarkerClusterUnmemoed({ + index, + seqName, + cluster, + pixelsPerBase, + description, + ...rest +}: SequenceMarkerClusterProps) { + const { t } = useTranslation() + const [showTooltip, setShowTooltip] = useState(false) + const onMouseEnter = useCallback(() => setShowTooltip(true), []) + const onMouseLeave = useCallback(() => setShowTooltip(false), []) + + const seqMarkerClusterHeightState = useRecoilValue(seqMarkerClusterHeightStateAtom) + const { y, height } = useMemo(() => getSeqMarkerDims(seqMarkerClusterHeightState), [seqMarkerClusterHeightState]) + + if (seqMarkerClusterHeightState === SeqMarkerHeightState.Off) { + return null + } + + const { start, end, count, events } = cluster + + const id = getSafeId('cluster-marker', { index, seqName, begin: start, end }) + + let width = (end - start + 1) * pixelsPerBase + width = Math.max(width, CLUSTER_MIN_WIDTH_PX) + const halfNuc = Math.max(pixelsPerBase, CLUSTER_MIN_WIDTH_PX) / 2 + const x = start * pixelsPerBase - halfNuc + + return ( + + + +
+ + {t('Mutation cluster: {{start}}-{{end}} ({{count}} events)', { + start: start + 1, + end: end + 1, + count, + })} + +
+ {events.length > 0 && ( + + {events.map((event) => ( + + ))} + + )} + {description && {description}} +
+
+ ) +} + +export const SequenceMarkerCluster = React.memo(SequenceMarkerClusterUnmemoed) + +function MutationPatternEventBadge({ event }: { event: MutationPatternEventMatch }) { + switch (event.type) { + case 'nucSubstitution': + return + } +} + +function mutationPatternEventKey(event: MutationPatternEventMatch): string { + switch (event.type) { + case 'nucSubstitution': + return `${event.type}:${event.pos}:${event.refNuc}:${event.qryNuc}` + } + throw new Error(`Unknown mutation pattern event type: ${event.type}`) +} diff --git a/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx b/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx index ea164a01f..b20c21a44 100644 --- a/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx +++ b/packages/nextclade-web/src/components/SequenceView/SequenceViewAbsolute.tsx @@ -11,6 +11,7 @@ import { SequenceMarkerGap } from './SequenceMarkerGap' import { SequenceMarkerMissing } from './SequenceMarkerMissing' import { SequenceMarkerMutation } from './SequenceMarkerMutation' import { SequenceMarkerUnsequencedEnd, SequenceMarkerUnsequencedStart } from './SequenceMarkerUnsequenced' +import { SequenceMarkerCluster } from './SequenceMarkerCluster' import { SequenceMarkerFrameShift } from './SequenceMarkerFrameShift' import { SequenceMarkerInsertion } from './SequenceMarkerInsertion' import { SequenceViewCoverageWrapper, SequenceViewCoverageText, SequenceViewSVG } from './SequenceViewStyles' @@ -32,6 +33,7 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr insertions, nucToAaMuts, nonACGTNs, + mutationPatterns, } = sequence const { t } = useTranslationSafe() @@ -103,6 +105,19 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr ) }) + const clusterViews = (mutationPatterns?.results ?? []).flatMap((pattern, patternIndex) => + (pattern.clusters ?? []).map((cluster) => ( + + )), + ) + const frameShiftMarkers = frameShifts.map((frameShift) => ( na.begin).join('-')}`} @@ -114,7 +129,7 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr )) const totalMarkers = - mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + clusterViews.length if (totalMarkers > maxNucMarkers) { return ( @@ -164,6 +179,7 @@ export function SequenceViewAbsolute({ sequence, width }: SequenceViewAbsolutePr pixelsPerBase={pixelsPerBase} /> {frameShiftMarkers} + {clusterViews} ) } diff --git a/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx b/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx index de99b7c56..f82001f9b 100644 --- a/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx +++ b/packages/nextclade-web/src/components/SequenceView/SequenceViewRelative.tsx @@ -11,6 +11,7 @@ import { SequenceMarkerMutation } from './SequenceMarkerMutation' import { SequenceMarkerGap } from './SequenceMarkerGap' import { SequenceMarkerAmbiguous } from './SequenceMarkerAmbiguous' import { SequenceMarkerMissing } from './SequenceMarkerMissing' +import { SequenceMarkerCluster } from './SequenceMarkerCluster' import { SequenceMarkerFrameShift } from './SequenceMarkerFrameShift' import { SequenceMarkerInsertion } from './SequenceMarkerInsertion' import { SequenceMarkerUnsequencedEnd, SequenceMarkerUnsequencedStart } from './SequenceMarkerUnsequenced' @@ -23,7 +24,7 @@ export interface SequenceViewRelativeProps { } export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceViewRelativeProps) { - const { index, seqName, missing, alignmentRange, frameShifts, insertions, nucToAaMuts, nonACGTNs } = sequence + const { index, seqName, missing, alignmentRange, frameShifts, insertions, nucToAaMuts, nonACGTNs, mutationPatterns } = sequence const { t } = useTranslationSafe() const maxNucMarkers = useRecoilValue(maxNucMarkersAtom) @@ -93,6 +94,19 @@ export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceV /> )) + const clusterViews = (mutationPatterns?.results ?? []).flatMap((pattern, patternIndex) => + (pattern.clusters ?? []).map((cluster) => ( + + )), + ) + const frameShiftMarkers = frameShifts.map((frameShift) => ( na.begin).join('-')}`} @@ -104,7 +118,7 @@ export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceV )) const totalMarkers = - mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + mutationViews.length + deletionViews.length + missingViews.length + frameShiftMarkers.length + insertionViews.length + clusterViews.length if (totalMarkers > maxNucMarkers) { return ( @@ -155,6 +169,7 @@ export function SequenceViewRelative({ sequence, width, refNodeName }: SequenceV pixelsPerBase={pixelsPerBase} /> {frameShiftMarkers} + {clusterViews} ) } diff --git a/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx b/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx index 34fc4e6b5..4aa6ac454 100644 --- a/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx +++ b/packages/nextclade-web/src/components/Settings/SeqViewSettings.tsx @@ -12,6 +12,7 @@ import { SeqMarkerState, maxNucMarkersAtom, seqMarkerAmbiguousHeightStateAtom, + seqMarkerClusterHeightStateAtom, seqMarkerFrameShiftStateAtom, seqMarkerGapHeightStateAtom, seqMarkerHeightStateFromString, @@ -78,6 +79,10 @@ export function SeqViewSettings() { seqMarkerUnsequencedHeightStateAtom, ) + const [seqMarkerClusterHeightState, setSeqMarkerClusterHeightState] = useSeqMarkerHeightState( + seqMarkerClusterHeightStateAtom, + ) + const [seqMarkerInsertionState, setSeqMarkerInsertionState] = useSeqMarkerState(seqMarkerInsertionStateAtom) const [seqMarkerFrameShiftState, setSeqMarkerFrameShiftState] = useSeqMarkerState(seqMarkerFrameShiftStateAtom) @@ -174,6 +179,18 @@ export function SeqViewSettings() { + + + +