|
| 1 | +//! This module contains the Smith-Waterman algorithm implementation for local sequence alignment. |
| 2 | +//! |
| 3 | +//! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining |
| 4 | +//! similar regions between two sequences (nucleotide or protein sequences). It is particularly |
| 5 | +//! useful in bioinformatics for identifying optimal local alignments. |
| 6 | +//! |
| 7 | +//! # Algorithm Overview |
| 8 | +//! |
| 9 | +//! The algorithm works by: |
| 10 | +//! 1. Creating a scoring matrix where each cell represents the maximum alignment score |
| 11 | +//! ending at that position |
| 12 | +//! 2. Using match, mismatch, and gap penalties to calculate scores |
| 13 | +//! 3. Allowing scores to reset to 0 (ensuring local rather than global alignment) |
| 14 | +//! 4. Tracing back from the highest scoring position to reconstruct the alignment |
| 15 | +//! |
| 16 | +//! # Time Complexity |
| 17 | +//! |
| 18 | +//! O(m * n) where m and n are the lengths of the two sequences |
| 19 | +//! |
| 20 | +//! # Space Complexity |
| 21 | +//! |
| 22 | +//! O(m * n) for the scoring matrix |
| 23 | +//! |
| 24 | +//! # References |
| 25 | +//! |
| 26 | +//! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5) |
| 27 | +//! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) |
| 28 | +
|
| 29 | +use std::cmp::max; |
| 30 | + |
| 31 | +/// Calculates the score for a character pair based on match, mismatch, or gap scoring. |
| 32 | +/// |
| 33 | +/// # Arguments |
| 34 | +/// |
| 35 | +/// * `source_char` - Character from the source sequence |
| 36 | +/// * `target_char` - Character from the target sequence |
| 37 | +/// * `match_score` - Score awarded for matching characters (typically positive) |
| 38 | +/// * `mismatch_score` - Score penalty for mismatching characters (typically negative) |
| 39 | +/// * `gap_score` - Score penalty for gaps (typically negative) |
| 40 | +/// |
| 41 | +/// # Returns |
| 42 | +/// |
| 43 | +/// The calculated score for the character pair |
| 44 | +/// |
| 45 | +/// # Examples |
| 46 | +/// |
| 47 | +/// ``` |
| 48 | +/// use the_algorithms_rust::dynamic_programming::score_function; |
| 49 | +/// |
| 50 | +/// let score = score_function('A', 'A', 1, -1, -2); |
| 51 | +/// assert_eq!(score, 1); // Match |
| 52 | +/// |
| 53 | +/// let score = score_function('A', 'C', 1, -1, -2); |
| 54 | +/// assert_eq!(score, -1); // Mismatch |
| 55 | +/// |
| 56 | +/// let score = score_function('-', 'A', 1, -1, -2); |
| 57 | +/// assert_eq!(score, -2); // Gap |
| 58 | +/// ``` |
| 59 | +pub fn score_function( |
| 60 | + source_char: char, |
| 61 | + target_char: char, |
| 62 | + match_score: i32, |
| 63 | + mismatch_score: i32, |
| 64 | + gap_score: i32, |
| 65 | +) -> i32 { |
| 66 | + if source_char == '-' || target_char == '-' { |
| 67 | + gap_score |
| 68 | + } else if source_char == target_char { |
| 69 | + match_score |
| 70 | + } else { |
| 71 | + mismatch_score |
| 72 | + } |
| 73 | +} |
| 74 | + |
| 75 | +/// Performs the Smith-Waterman local sequence alignment algorithm. |
| 76 | +/// |
| 77 | +/// This function creates a scoring matrix using dynamic programming to find the |
| 78 | +/// optimal local alignment between two sequences. The algorithm is case-insensitive. |
| 79 | +/// |
| 80 | +/// # Arguments |
| 81 | +/// |
| 82 | +/// * `query` - The query sequence (e.g., DNA, protein) |
| 83 | +/// * `subject` - The subject sequence to align against |
| 84 | +/// * `match_score` - Score for matching characters (default: 1) |
| 85 | +/// * `mismatch_score` - Penalty for mismatching characters (default: -1) |
| 86 | +/// * `gap_score` - Penalty for gaps/indels (default: -2) |
| 87 | +/// |
| 88 | +/// # Returns |
| 89 | +/// |
| 90 | +/// A 2D vector representing the dynamic programming scoring matrix |
| 91 | +/// |
| 92 | +/// # Examples |
| 93 | +/// |
| 94 | +/// ``` |
| 95 | +/// use the_algorithms_rust::dynamic_programming::smith_waterman; |
| 96 | +/// |
| 97 | +/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); |
| 98 | +/// assert_eq!(score_matrix.len(), 5); // query length + 1 |
| 99 | +/// assert_eq!(score_matrix[0].len(), 3); // subject length + 1 |
| 100 | +/// ``` |
| 101 | +pub fn smith_waterman( |
| 102 | + query: &str, |
| 103 | + subject: &str, |
| 104 | + match_score: i32, |
| 105 | + mismatch_score: i32, |
| 106 | + gap_score: i32, |
| 107 | +) -> Vec<Vec<i32>> { |
| 108 | + let query_upper: Vec<char> = query.to_uppercase().chars().collect(); |
| 109 | + let subject_upper: Vec<char> = subject.to_uppercase().chars().collect(); |
| 110 | + |
| 111 | + let m = query_upper.len(); |
| 112 | + let n = subject_upper.len(); |
| 113 | + |
| 114 | + // Initialize scoring matrix with zeros |
| 115 | + let mut score = vec![vec![0; n + 1]; m + 1]; |
| 116 | + |
| 117 | + // Fill the scoring matrix using dynamic programming |
| 118 | + for i in 1..=m { |
| 119 | + for j in 1..=n { |
| 120 | + // Calculate score for match/mismatch |
| 121 | + let match_or_mismatch = score[i - 1][j - 1] |
| 122 | + + score_function( |
| 123 | + query_upper[i - 1], |
| 124 | + subject_upper[j - 1], |
| 125 | + match_score, |
| 126 | + mismatch_score, |
| 127 | + gap_score, |
| 128 | + ); |
| 129 | + |
| 130 | + // Calculate score for deletion (gap in subject) |
| 131 | + let delete = score[i - 1][j] + gap_score; |
| 132 | + |
| 133 | + // Calculate score for insertion (gap in query) |
| 134 | + let insert = score[i][j - 1] + gap_score; |
| 135 | + |
| 136 | + // Take maximum of all options, but never go below 0 (local alignment) |
| 137 | + score[i][j] = max(0, max(match_or_mismatch, max(delete, insert))); |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + score |
| 142 | +} |
| 143 | + |
| 144 | +/// Performs traceback on the Smith-Waterman score matrix to reconstruct the optimal alignment. |
| 145 | +/// |
| 146 | +/// This function starts from the highest scoring cell and traces back through the matrix |
| 147 | +/// to reconstruct the aligned sequences. The traceback stops when a cell with score 0 |
| 148 | +/// is encountered. |
| 149 | +/// |
| 150 | +/// # Arguments |
| 151 | +/// |
| 152 | +/// * `score` - The score matrix from the Smith-Waterman algorithm |
| 153 | +/// * `query` - Original query sequence used in alignment |
| 154 | +/// * `subject` - Original subject sequence used in alignment |
| 155 | +/// |
| 156 | +/// # Returns |
| 157 | +/// |
| 158 | +/// A String containing the two aligned sequences separated by a newline, |
| 159 | +/// or an empty string if no significant alignment is found |
| 160 | +/// |
| 161 | +/// # Examples |
| 162 | +/// |
| 163 | +/// ``` |
| 164 | +/// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback}; |
| 165 | +/// |
| 166 | +/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); |
| 167 | +/// let alignment = traceback(&score_matrix, "ACAC", "CA"); |
| 168 | +/// assert_eq!(alignment, "CA\nCA"); |
| 169 | +/// ``` |
| 170 | +pub fn traceback(score: &[Vec<i32>], query: &str, subject: &str) -> String { |
| 171 | + let query_upper: Vec<char> = query.to_uppercase().chars().collect(); |
| 172 | + let subject_upper: Vec<char> = subject.to_uppercase().chars().collect(); |
| 173 | + |
| 174 | + // Find the cell with maximum score |
| 175 | + let mut max_value = i32::MIN; |
| 176 | + let (mut i_max, mut j_max) = (0, 0); |
| 177 | + |
| 178 | + for (i, row) in score.iter().enumerate() { |
| 179 | + for (j, &value) in row.iter().enumerate() { |
| 180 | + if value > max_value { |
| 181 | + max_value = value; |
| 182 | + i_max = i; |
| 183 | + j_max = j; |
| 184 | + } |
| 185 | + } |
| 186 | + } |
| 187 | + |
| 188 | + // If no significant alignment found, return empty string |
| 189 | + if i_max == 0 || j_max == 0 { |
| 190 | + return String::new(); |
| 191 | + } |
| 192 | + |
| 193 | + // Traceback from the maximum scoring cell |
| 194 | + let (mut i, mut j) = (i_max, j_max); |
| 195 | + let mut align1 = String::new(); |
| 196 | + let mut align2 = String::new(); |
| 197 | + |
| 198 | + // Continue tracing back until we hit a cell with score 0 |
| 199 | + while i > 0 && j > 0 && score[i][j] > 0 { |
| 200 | + let current_score = score[i][j]; |
| 201 | + |
| 202 | + // Check if we came from diagonal (match/mismatch) |
| 203 | + if current_score |
| 204 | + == score[i - 1][j - 1] |
| 205 | + + score_function(query_upper[i - 1], subject_upper[j - 1], 1, -1, -2) |
| 206 | + { |
| 207 | + align1.insert(0, query_upper[i - 1]); |
| 208 | + align2.insert(0, subject_upper[j - 1]); |
| 209 | + i -= 1; |
| 210 | + j -= 1; |
| 211 | + } |
| 212 | + // Check if we came from above (deletion/gap in subject) |
| 213 | + else if current_score == score[i - 1][j] - 2 { |
| 214 | + align1.insert(0, query_upper[i - 1]); |
| 215 | + align2.insert(0, '-'); |
| 216 | + i -= 1; |
| 217 | + } |
| 218 | + // Otherwise we came from left (insertion/gap in query) |
| 219 | + else { |
| 220 | + align1.insert(0, '-'); |
| 221 | + align2.insert(0, subject_upper[j - 1]); |
| 222 | + j -= 1; |
| 223 | + } |
| 224 | + } |
| 225 | + |
| 226 | + format!("{align1}\n{align2}") |
| 227 | +} |
| 228 | + |
| 229 | +#[cfg(test)] |
| 230 | +mod tests { |
| 231 | + use super::*; |
| 232 | + |
| 233 | + macro_rules! smith_waterman_tests { |
| 234 | + ($($name:ident: $test_cases:expr,)*) => { |
| 235 | + $( |
| 236 | + #[test] |
| 237 | + fn $name() { |
| 238 | + let (query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases; |
| 239 | + assert_eq!(smith_waterman(query, subject, match_score, mismatch_score, gap_score), expected); |
| 240 | + } |
| 241 | + )* |
| 242 | + } |
| 243 | + } |
| 244 | + |
| 245 | + macro_rules! traceback_tests { |
| 246 | + ($($name:ident: $test_cases:expr,)*) => { |
| 247 | + $( |
| 248 | + #[test] |
| 249 | + fn $name() { |
| 250 | + let (score, query, subject, expected) = $test_cases; |
| 251 | + assert_eq!(traceback(&score, query, subject), expected); |
| 252 | + } |
| 253 | + )* |
| 254 | + } |
| 255 | + } |
| 256 | + |
| 257 | + smith_waterman_tests! { |
| 258 | + test_acac_ca: ("ACAC", "CA", 1, -1, -2, vec![ |
| 259 | + vec![0, 0, 0], |
| 260 | + vec![0, 0, 1], |
| 261 | + vec![0, 1, 0], |
| 262 | + vec![0, 0, 2], |
| 263 | + vec![0, 1, 0], |
| 264 | + ]), |
| 265 | + test_agt_agt: ("AGT", "AGT", 1, -1, -2, vec![ |
| 266 | + vec![0, 0, 0, 0], |
| 267 | + vec![0, 1, 0, 0], |
| 268 | + vec![0, 0, 2, 0], |
| 269 | + vec![0, 0, 0, 3], |
| 270 | + ]), |
| 271 | + test_agt_gta: ("AGT", "GTA", 1, -1, -2, vec![ |
| 272 | + vec![0, 0, 0, 0], |
| 273 | + vec![0, 0, 0, 1], |
| 274 | + vec![0, 1, 0, 0], |
| 275 | + vec![0, 0, 2, 0], |
| 276 | + ]), |
| 277 | + test_agt_g: ("AGT", "G", 1, -1, -2, vec![ |
| 278 | + vec![0, 0], |
| 279 | + vec![0, 0], |
| 280 | + vec![0, 1], |
| 281 | + vec![0, 0], |
| 282 | + ]), |
| 283 | + test_g_agt: ("G", "AGT", 1, -1, -2, vec![ |
| 284 | + vec![0, 0, 0, 0], |
| 285 | + vec![0, 0, 1, 0], |
| 286 | + ]), |
| 287 | + test_empty_query: ("", "CA", 1, -1, -2, vec![vec![0, 0, 0]]), |
| 288 | + test_empty_subject: ("ACAC", "", 1, -1, -2, vec![vec![0], vec![0], vec![0], vec![0], vec![0]]), |
| 289 | + test_both_empty: ("", "", 1, -1, -2, vec![vec![0]]), |
| 290 | + } |
| 291 | + |
| 292 | + traceback_tests! { |
| 293 | + test_traceback_acac_ca: ( |
| 294 | + vec![ |
| 295 | + vec![0, 0, 0], |
| 296 | + vec![0, 0, 1], |
| 297 | + vec![0, 1, 0], |
| 298 | + vec![0, 0, 2], |
| 299 | + vec![0, 1, 0], |
| 300 | + ], |
| 301 | + "ACAC", |
| 302 | + "CA", |
| 303 | + "CA\nCA", |
| 304 | + ), |
| 305 | + test_traceback_agt_agt: ( |
| 306 | + vec![ |
| 307 | + vec![0, 0, 0, 0], |
| 308 | + vec![0, 1, 0, 0], |
| 309 | + vec![0, 0, 2, 0], |
| 310 | + vec![0, 0, 0, 3], |
| 311 | + ], |
| 312 | + "AGT", |
| 313 | + "AGT", |
| 314 | + "AGT\nAGT", |
| 315 | + ), |
| 316 | + test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", ""), |
| 317 | + } |
| 318 | + |
| 319 | + #[test] |
| 320 | + fn test_score_function_match() { |
| 321 | + assert_eq!(score_function('A', 'A', 1, -1, -2), 1); |
| 322 | + assert_eq!(score_function('G', 'G', 2, -1, -1), 2); |
| 323 | + } |
| 324 | + |
| 325 | + #[test] |
| 326 | + fn test_score_function_mismatch() { |
| 327 | + assert_eq!(score_function('A', 'C', 1, -1, -2), -1); |
| 328 | + assert_eq!(score_function('G', 'T', 1, -2, -1), -2); |
| 329 | + } |
| 330 | + |
| 331 | + #[test] |
| 332 | + fn test_score_function_gap() { |
| 333 | + assert_eq!(score_function('-', 'A', 1, -1, -2), -2); |
| 334 | + assert_eq!(score_function('A', '-', 1, -1, -2), -2); |
| 335 | + } |
| 336 | + |
| 337 | + #[test] |
| 338 | + fn test_case_insensitive() { |
| 339 | + let result1 = smith_waterman("acac", "CA", 1, -1, -2); |
| 340 | + let result2 = smith_waterman("ACAC", "ca", 1, -1, -2); |
| 341 | + let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2); |
| 342 | + |
| 343 | + assert_eq!(result1, result2); |
| 344 | + assert_eq!(result2, result3); |
| 345 | + } |
| 346 | +} |
0 commit comments