Skip to content

Commit 91fa3f4

Browse files
feat: add Smith-Waterman algorithm for local sequence alignment
1 parent eaf895d commit 91fa3f4

3 files changed

Lines changed: 349 additions & 0 deletions

File tree

DIRECTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@
103103
* [Minimum Cost Path](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/minimum_cost_path.rs)
104104
* [Optimal Bst](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/optimal_bst.rs)
105105
* [Rod Cutting](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/rod_cutting.rs)
106+
* [Smith-Waterman](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/smith_waterman.rs)
106107
* [Snail](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/snail.rs)
107108
* [Subset Generation](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/subset_generation.rs)
108109
* [Task Assignment](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/task_assignment.rs)

src/dynamic_programming/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ mod maximum_subarray;
1414
mod minimum_cost_path;
1515
mod optimal_bst;
1616
mod rod_cutting;
17+
mod smith_waterman;
1718
mod snail;
1819
mod subset_generation;
1920
mod subset_sum;
@@ -45,6 +46,7 @@ pub use self::maximum_subarray::maximum_subarray;
4546
pub use self::minimum_cost_path::minimum_cost_path;
4647
pub use self::optimal_bst::optimal_search_tree;
4748
pub use self::rod_cutting::rod_cut;
49+
pub use self::smith_waterman::{score_function, smith_waterman, traceback};
4850
pub use self::snail::snail;
4951
pub use self::subset_generation::list_subset;
5052
pub use self::subset_sum::is_sum_subset;
Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
//! This module contains the Smith-Waterman algorithm implementation for local sequence alignment.
2+
//!
3+
//! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining
4+
//! similar regions between two sequences (nucleotide or protein sequences). It is particularly
5+
//! useful in bioinformatics for identifying optimal local alignments.
6+
//!
7+
//! # Algorithm Overview
8+
//!
9+
//! The algorithm works by:
10+
//! 1. Creating a scoring matrix where each cell represents the maximum alignment score
11+
//! ending at that position
12+
//! 2. Using match, mismatch, and gap penalties to calculate scores
13+
//! 3. Allowing scores to reset to 0 (ensuring local rather than global alignment)
14+
//! 4. Tracing back from the highest scoring position to reconstruct the alignment
15+
//!
16+
//! # Time Complexity
17+
//!
18+
//! O(m * n) where m and n are the lengths of the two sequences
19+
//!
20+
//! # Space Complexity
21+
//!
22+
//! O(m * n) for the scoring matrix
23+
//!
24+
//! # References
25+
//!
26+
//! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5)
27+
//! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm)
28+
29+
use std::cmp::max;
30+
31+
/// Calculates the score for a character pair based on match, mismatch, or gap scoring.
32+
///
33+
/// # Arguments
34+
///
35+
/// * `source_char` - Character from the source sequence
36+
/// * `target_char` - Character from the target sequence
37+
/// * `match_score` - Score awarded for matching characters (typically positive)
38+
/// * `mismatch_score` - Score penalty for mismatching characters (typically negative)
39+
/// * `gap_score` - Score penalty for gaps (typically negative)
40+
///
41+
/// # Returns
42+
///
43+
/// The calculated score for the character pair
44+
///
45+
/// # Examples
46+
///
47+
/// ```
48+
/// use the_algorithms_rust::dynamic_programming::score_function;
49+
///
50+
/// let score = score_function('A', 'A', 1, -1, -2);
51+
/// assert_eq!(score, 1); // Match
52+
///
53+
/// let score = score_function('A', 'C', 1, -1, -2);
54+
/// assert_eq!(score, -1); // Mismatch
55+
///
56+
/// let score = score_function('-', 'A', 1, -1, -2);
57+
/// assert_eq!(score, -2); // Gap
58+
/// ```
59+
pub fn score_function(
60+
source_char: char,
61+
target_char: char,
62+
match_score: i32,
63+
mismatch_score: i32,
64+
gap_score: i32,
65+
) -> i32 {
66+
if source_char == '-' || target_char == '-' {
67+
gap_score
68+
} else if source_char == target_char {
69+
match_score
70+
} else {
71+
mismatch_score
72+
}
73+
}
74+
75+
/// Performs the Smith-Waterman local sequence alignment algorithm.
76+
///
77+
/// This function creates a scoring matrix using dynamic programming to find the
78+
/// optimal local alignment between two sequences. The algorithm is case-insensitive.
79+
///
80+
/// # Arguments
81+
///
82+
/// * `query` - The query sequence (e.g., DNA, protein)
83+
/// * `subject` - The subject sequence to align against
84+
/// * `match_score` - Score for matching characters (default: 1)
85+
/// * `mismatch_score` - Penalty for mismatching characters (default: -1)
86+
/// * `gap_score` - Penalty for gaps/indels (default: -2)
87+
///
88+
/// # Returns
89+
///
90+
/// A 2D vector representing the dynamic programming scoring matrix
91+
///
92+
/// # Examples
93+
///
94+
/// ```
95+
/// use the_algorithms_rust::dynamic_programming::smith_waterman;
96+
///
97+
/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2);
98+
/// assert_eq!(score_matrix.len(), 5); // query length + 1
99+
/// assert_eq!(score_matrix[0].len(), 3); // subject length + 1
100+
/// ```
101+
pub fn smith_waterman(
102+
query: &str,
103+
subject: &str,
104+
match_score: i32,
105+
mismatch_score: i32,
106+
gap_score: i32,
107+
) -> Vec<Vec<i32>> {
108+
let query_upper: Vec<char> = query.to_uppercase().chars().collect();
109+
let subject_upper: Vec<char> = subject.to_uppercase().chars().collect();
110+
111+
let m = query_upper.len();
112+
let n = subject_upper.len();
113+
114+
// Initialize scoring matrix with zeros
115+
let mut score = vec![vec![0; n + 1]; m + 1];
116+
117+
// Fill the scoring matrix using dynamic programming
118+
for i in 1..=m {
119+
for j in 1..=n {
120+
// Calculate score for match/mismatch
121+
let match_or_mismatch = score[i - 1][j - 1]
122+
+ score_function(
123+
query_upper[i - 1],
124+
subject_upper[j - 1],
125+
match_score,
126+
mismatch_score,
127+
gap_score,
128+
);
129+
130+
// Calculate score for deletion (gap in subject)
131+
let delete = score[i - 1][j] + gap_score;
132+
133+
// Calculate score for insertion (gap in query)
134+
let insert = score[i][j - 1] + gap_score;
135+
136+
// Take maximum of all options, but never go below 0 (local alignment)
137+
score[i][j] = max(0, max(match_or_mismatch, max(delete, insert)));
138+
}
139+
}
140+
141+
score
142+
}
143+
144+
/// Performs traceback on the Smith-Waterman score matrix to reconstruct the optimal alignment.
145+
///
146+
/// This function starts from the highest scoring cell and traces back through the matrix
147+
/// to reconstruct the aligned sequences. The traceback stops when a cell with score 0
148+
/// is encountered.
149+
///
150+
/// # Arguments
151+
///
152+
/// * `score` - The score matrix from the Smith-Waterman algorithm
153+
/// * `query` - Original query sequence used in alignment
154+
/// * `subject` - Original subject sequence used in alignment
155+
///
156+
/// # Returns
157+
///
158+
/// A String containing the two aligned sequences separated by a newline,
159+
/// or an empty string if no significant alignment is found
160+
///
161+
/// # Examples
162+
///
163+
/// ```
164+
/// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback};
165+
///
166+
/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2);
167+
/// let alignment = traceback(&score_matrix, "ACAC", "CA");
168+
/// assert_eq!(alignment, "CA\nCA");
169+
/// ```
170+
pub fn traceback(score: &[Vec<i32>], query: &str, subject: &str) -> String {
171+
let query_upper: Vec<char> = query.to_uppercase().chars().collect();
172+
let subject_upper: Vec<char> = subject.to_uppercase().chars().collect();
173+
174+
// Find the cell with maximum score
175+
let mut max_value = i32::MIN;
176+
let (mut i_max, mut j_max) = (0, 0);
177+
178+
for (i, row) in score.iter().enumerate() {
179+
for (j, &value) in row.iter().enumerate() {
180+
if value > max_value {
181+
max_value = value;
182+
i_max = i;
183+
j_max = j;
184+
}
185+
}
186+
}
187+
188+
// If no significant alignment found, return empty string
189+
if i_max == 0 || j_max == 0 {
190+
return String::new();
191+
}
192+
193+
// Traceback from the maximum scoring cell
194+
let (mut i, mut j) = (i_max, j_max);
195+
let mut align1 = String::new();
196+
let mut align2 = String::new();
197+
198+
// Continue tracing back until we hit a cell with score 0
199+
while i > 0 && j > 0 && score[i][j] > 0 {
200+
let current_score = score[i][j];
201+
202+
// Check if we came from diagonal (match/mismatch)
203+
if current_score
204+
== score[i - 1][j - 1]
205+
+ score_function(query_upper[i - 1], subject_upper[j - 1], 1, -1, -2)
206+
{
207+
align1.insert(0, query_upper[i - 1]);
208+
align2.insert(0, subject_upper[j - 1]);
209+
i -= 1;
210+
j -= 1;
211+
}
212+
// Check if we came from above (deletion/gap in subject)
213+
else if current_score == score[i - 1][j] - 2 {
214+
align1.insert(0, query_upper[i - 1]);
215+
align2.insert(0, '-');
216+
i -= 1;
217+
}
218+
// Otherwise we came from left (insertion/gap in query)
219+
else {
220+
align1.insert(0, '-');
221+
align2.insert(0, subject_upper[j - 1]);
222+
j -= 1;
223+
}
224+
}
225+
226+
format!("{align1}\n{align2}")
227+
}
228+
229+
#[cfg(test)]
230+
mod tests {
231+
use super::*;
232+
233+
macro_rules! smith_waterman_tests {
234+
($($name:ident: $test_cases:expr,)*) => {
235+
$(
236+
#[test]
237+
fn $name() {
238+
let (query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases;
239+
assert_eq!(smith_waterman(query, subject, match_score, mismatch_score, gap_score), expected);
240+
}
241+
)*
242+
}
243+
}
244+
245+
macro_rules! traceback_tests {
246+
($($name:ident: $test_cases:expr,)*) => {
247+
$(
248+
#[test]
249+
fn $name() {
250+
let (score, query, subject, expected) = $test_cases;
251+
assert_eq!(traceback(&score, query, subject), expected);
252+
}
253+
)*
254+
}
255+
}
256+
257+
smith_waterman_tests! {
258+
test_acac_ca: ("ACAC", "CA", 1, -1, -2, vec![
259+
vec![0, 0, 0],
260+
vec![0, 0, 1],
261+
vec![0, 1, 0],
262+
vec![0, 0, 2],
263+
vec![0, 1, 0],
264+
]),
265+
test_agt_agt: ("AGT", "AGT", 1, -1, -2, vec![
266+
vec![0, 0, 0, 0],
267+
vec![0, 1, 0, 0],
268+
vec![0, 0, 2, 0],
269+
vec![0, 0, 0, 3],
270+
]),
271+
test_agt_gta: ("AGT", "GTA", 1, -1, -2, vec![
272+
vec![0, 0, 0, 0],
273+
vec![0, 0, 0, 1],
274+
vec![0, 1, 0, 0],
275+
vec![0, 0, 2, 0],
276+
]),
277+
test_agt_g: ("AGT", "G", 1, -1, -2, vec![
278+
vec![0, 0],
279+
vec![0, 0],
280+
vec![0, 1],
281+
vec![0, 0],
282+
]),
283+
test_g_agt: ("G", "AGT", 1, -1, -2, vec![
284+
vec![0, 0, 0, 0],
285+
vec![0, 0, 1, 0],
286+
]),
287+
test_empty_query: ("", "CA", 1, -1, -2, vec![vec![0, 0, 0]]),
288+
test_empty_subject: ("ACAC", "", 1, -1, -2, vec![vec![0], vec![0], vec![0], vec![0], vec![0]]),
289+
test_both_empty: ("", "", 1, -1, -2, vec![vec![0]]),
290+
}
291+
292+
traceback_tests! {
293+
test_traceback_acac_ca: (
294+
vec![
295+
vec![0, 0, 0],
296+
vec![0, 0, 1],
297+
vec![0, 1, 0],
298+
vec![0, 0, 2],
299+
vec![0, 1, 0],
300+
],
301+
"ACAC",
302+
"CA",
303+
"CA\nCA",
304+
),
305+
test_traceback_agt_agt: (
306+
vec![
307+
vec![0, 0, 0, 0],
308+
vec![0, 1, 0, 0],
309+
vec![0, 0, 2, 0],
310+
vec![0, 0, 0, 3],
311+
],
312+
"AGT",
313+
"AGT",
314+
"AGT\nAGT",
315+
),
316+
test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", ""),
317+
}
318+
319+
#[test]
320+
fn test_score_function_match() {
321+
assert_eq!(score_function('A', 'A', 1, -1, -2), 1);
322+
assert_eq!(score_function('G', 'G', 2, -1, -1), 2);
323+
}
324+
325+
#[test]
326+
fn test_score_function_mismatch() {
327+
assert_eq!(score_function('A', 'C', 1, -1, -2), -1);
328+
assert_eq!(score_function('G', 'T', 1, -2, -1), -2);
329+
}
330+
331+
#[test]
332+
fn test_score_function_gap() {
333+
assert_eq!(score_function('-', 'A', 1, -1, -2), -2);
334+
assert_eq!(score_function('A', '-', 1, -1, -2), -2);
335+
}
336+
337+
#[test]
338+
fn test_case_insensitive() {
339+
let result1 = smith_waterman("acac", "CA", 1, -1, -2);
340+
let result2 = smith_waterman("ACAC", "ca", 1, -1, -2);
341+
let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2);
342+
343+
assert_eq!(result1, result2);
344+
assert_eq!(result2, result3);
345+
}
346+
}

0 commit comments

Comments
 (0)