Skip to content

Commit aa24550

Browse files
feat: add principle component analysis (#999)
1 parent d246ca5 commit aa24550

File tree

3 files changed

+328
-0
lines changed

3 files changed

+328
-0
lines changed

DIRECTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@
209209
* [Linear Regression](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/linear_regression.rs)
210210
* [Logistic Regression](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/logistic_regression.rs)
211211
* [Naive Bayes](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/naive_bayes.rs)
212+
* [Principal Component Analysis](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/principal_component_analysis.rs)
212213
* Loss Function
213214
* [Average Margin Ranking Loss](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/loss_function/average_margin_ranking_loss.rs)
214215
* [Hinge Loss](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/loss_function/hinge_loss.rs)

src/machine_learning/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod logistic_regression;
66
mod loss_function;
77
mod naive_bayes;
88
mod optimization;
9+
mod principal_component_analysis;
910

1011
pub use self::cholesky::cholesky;
1112
pub use self::k_means::k_means;
@@ -22,3 +23,4 @@ pub use self::loss_function::neg_log_likelihood;
2223
pub use self::naive_bayes::naive_bayes;
2324
pub use self::optimization::gradient_descent;
2425
pub use self::optimization::Adam;
26+
pub use self::principal_component_analysis::principal_component_analysis;
Lines changed: 325 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,325 @@
1+
/// Principal Component Analysis (PCA) for dimensionality reduction.
2+
/// PCA transforms data to a new coordinate system where the greatest
3+
/// variance lies on the first coordinate (first principal component),
4+
/// the second greatest variance on the second coordinate, and so on.
5+
6+
/// Compute the mean of each feature across all samples
7+
fn compute_means(data: &[Vec<f64>]) -> Vec<f64> {
8+
if data.is_empty() {
9+
return vec![];
10+
}
11+
12+
let num_features = data[0].len();
13+
let mut means = vec![0.0; num_features];
14+
15+
for sample in data {
16+
for (i, &feature) in sample.iter().enumerate() {
17+
means[i] += feature;
18+
}
19+
}
20+
21+
let n = data.len() as f64;
22+
for mean in &mut means {
23+
*mean /= n;
24+
}
25+
26+
means
27+
}
28+
29+
/// Center the data by subtracting the mean from each feature
30+
fn center_data(data: &[Vec<f64>], means: &[f64]) -> Vec<Vec<f64>> {
31+
data.iter()
32+
.map(|sample| {
33+
sample
34+
.iter()
35+
.zip(means.iter())
36+
.map(|(&x, &mean)| x - mean)
37+
.collect()
38+
})
39+
.collect()
40+
}
41+
42+
/// Compute covariance matrix from centered data
43+
fn compute_covariance_matrix(centered_data: &[Vec<f64>]) -> Vec<f64> {
44+
if centered_data.is_empty() {
45+
return vec![];
46+
}
47+
48+
let n = centered_data.len();
49+
let num_features = centered_data[0].len();
50+
51+
let mut cov_matrix = vec![0.0; num_features * num_features];
52+
53+
for i in 0..num_features {
54+
for j in i..num_features {
55+
let mut cov = 0.0;
56+
for sample in centered_data {
57+
cov += sample[i] * sample[j];
58+
}
59+
cov /= n as f64;
60+
61+
cov_matrix[i * num_features + j] = cov;
62+
cov_matrix[j * num_features + i] = cov;
63+
}
64+
}
65+
66+
cov_matrix
67+
}
68+
69+
/// Power iteration method to find the dominant eigenvalue and eigenvector
70+
fn power_iteration(matrix: &[f64], n: usize, max_iter: usize, tolerance: f64) -> (f64, Vec<f64>) {
71+
let mut b_k = vec![1.0; n];
72+
let mut b_k_prev = vec![0.0; n];
73+
74+
for _ in 0..max_iter {
75+
b_k_prev.clone_from(&b_k);
76+
77+
let mut b_k_new = vec![0.0; n];
78+
for i in 0..n {
79+
for j in 0..n {
80+
b_k_new[i] += matrix[i * n + j] * b_k[j];
81+
}
82+
}
83+
84+
let norm = b_k_new.iter().map(|x| x * x).sum::<f64>().sqrt();
85+
if norm > 1e-10 {
86+
for val in &mut b_k_new {
87+
*val /= norm;
88+
}
89+
}
90+
91+
b_k = b_k_new;
92+
93+
let diff: f64 = b_k
94+
.iter()
95+
.zip(b_k_prev.iter())
96+
.map(|(a, b)| (a - b).abs())
97+
.fold(0.0, |acc, x| acc.max(x));
98+
99+
if diff < tolerance {
100+
break;
101+
}
102+
}
103+
104+
let eigenvalue = b_k
105+
.iter()
106+
.enumerate()
107+
.map(|(i, &val)| {
108+
let mut row_sum = 0.0;
109+
for j in 0..n {
110+
row_sum += matrix[i * n + j] * b_k[j];
111+
}
112+
row_sum * val
113+
})
114+
.sum::<f64>()
115+
/ b_k.iter().map(|x| x * x).sum::<f64>();
116+
117+
(eigenvalue, b_k)
118+
}
119+
120+
/// Deflate a matrix by removing the component along a given eigenvector
121+
fn deflate_matrix(matrix: &[f64], eigenvector: &[f64], eigenvalue: f64, n: usize) -> Vec<f64> {
122+
let mut deflated = matrix.to_vec();
123+
124+
for i in 0..n {
125+
for j in 0..n {
126+
deflated[i * n + j] -= eigenvalue * eigenvector[i] * eigenvector[j];
127+
}
128+
}
129+
130+
deflated
131+
}
132+
133+
/// Perform PCA on the input data
134+
/// Returns transformed data with reduced dimensions
135+
pub fn principal_component_analysis(
136+
data: Vec<Vec<f64>>,
137+
num_components: usize,
138+
) -> Option<Vec<Vec<f64>>> {
139+
if data.is_empty() {
140+
return None;
141+
}
142+
143+
let num_features = data[0].len();
144+
145+
if num_features == 0 {
146+
return None;
147+
}
148+
149+
if num_components > num_features {
150+
return None;
151+
}
152+
153+
if num_components == 0 {
154+
return None;
155+
}
156+
157+
let means = compute_means(&data);
158+
let centered_data = center_data(&data, &means);
159+
let cov_matrix = compute_covariance_matrix(&centered_data);
160+
161+
let mut eigenvectors = Vec::new();
162+
let mut deflated_matrix = cov_matrix;
163+
164+
for _ in 0..num_components {
165+
let (_eigenvalue, eigenvector) =
166+
power_iteration(&deflated_matrix, num_features, 1000, 1e-10);
167+
eigenvectors.push(eigenvector);
168+
deflated_matrix = deflate_matrix(
169+
&deflated_matrix,
170+
eigenvectors.last().unwrap(),
171+
_eigenvalue,
172+
num_features,
173+
);
174+
}
175+
176+
let transformed_data: Vec<Vec<f64>> = centered_data
177+
.iter()
178+
.map(|sample| {
179+
(0..num_components)
180+
.map(|k| {
181+
eigenvectors[k]
182+
.iter()
183+
.zip(sample.iter())
184+
.map(|(&ev, &s)| ev * s)
185+
.sum::<f64>()
186+
})
187+
.collect()
188+
})
189+
.collect();
190+
191+
Some(transformed_data)
192+
}
193+
194+
#[cfg(test)]
195+
mod test {
196+
use super::*;
197+
198+
#[test]
199+
fn test_pca_simple() {
200+
let data = vec![
201+
vec![1.0, 2.0],
202+
vec![2.0, 3.0],
203+
vec![3.0, 4.0],
204+
vec![4.0, 5.0],
205+
vec![5.0, 6.0],
206+
];
207+
208+
let result = principal_component_analysis(data, 1);
209+
assert!(result.is_some());
210+
211+
let transformed = result.unwrap();
212+
assert_eq!(transformed.len(), 5);
213+
assert_eq!(transformed[0].len(), 1);
214+
215+
let all_values: Vec<f64> = transformed.iter().map(|v| v[0]).collect();
216+
let mean = all_values.iter().sum::<f64>() / all_values.len() as f64;
217+
218+
assert!((mean).abs() < 1e-5);
219+
}
220+
221+
#[test]
222+
fn test_pca_empty_data() {
223+
let data = vec![];
224+
let result = principal_component_analysis(data, 2);
225+
assert_eq!(result, None);
226+
}
227+
228+
#[test]
229+
fn test_pca_empty_features() {
230+
let data = vec![vec![], vec![]];
231+
let result = principal_component_analysis(data, 1);
232+
assert_eq!(result, None);
233+
}
234+
235+
#[test]
236+
fn test_pca_invalid_num_components() {
237+
let data = vec![vec![1.0, 2.0], vec![2.0, 3.0]];
238+
239+
let result = principal_component_analysis(data.clone(), 3);
240+
assert_eq!(result, None);
241+
242+
let result = principal_component_analysis(data, 0);
243+
assert_eq!(result, None);
244+
}
245+
246+
#[test]
247+
fn test_pca_preserves_dimensions() {
248+
let data = vec![
249+
vec![1.0, 2.0, 3.0],
250+
vec![4.0, 5.0, 6.0],
251+
vec![7.0, 8.0, 9.0],
252+
];
253+
254+
let result = principal_component_analysis(data, 2);
255+
assert!(result.is_some());
256+
257+
let transformed = result.unwrap();
258+
assert_eq!(transformed.len(), 3);
259+
assert_eq!(transformed[0].len(), 2);
260+
}
261+
262+
#[test]
263+
fn test_pca_reconstruction_variance() {
264+
let data = vec![
265+
vec![2.5, 2.4],
266+
vec![0.5, 0.7],
267+
vec![2.2, 2.9],
268+
vec![1.9, 2.2],
269+
vec![3.1, 3.0],
270+
vec![2.3, 2.7],
271+
vec![2.0, 1.6],
272+
vec![1.0, 1.1],
273+
vec![1.5, 1.6],
274+
vec![1.1, 0.9],
275+
];
276+
277+
let result = principal_component_analysis(data, 1);
278+
assert!(result.is_some());
279+
280+
let transformed = result.unwrap();
281+
assert_eq!(transformed.len(), 10);
282+
assert_eq!(transformed[0].len(), 1);
283+
}
284+
285+
#[test]
286+
fn test_center_data() {
287+
let data = vec![
288+
vec![1.0, 2.0, 3.0],
289+
vec![4.0, 5.0, 6.0],
290+
vec![7.0, 8.0, 9.0],
291+
];
292+
293+
let means = vec![4.0, 5.0, 6.0];
294+
let centered = center_data(&data, &means);
295+
296+
assert_eq!(centered[0], vec![-3.0, -3.0, -3.0]);
297+
assert_eq!(centered[1], vec![0.0, 0.0, 0.0]);
298+
assert_eq!(centered[2], vec![3.0, 3.0, 3.0]);
299+
}
300+
301+
#[test]
302+
fn test_compute_means() {
303+
let data = vec![
304+
vec![1.0, 2.0, 3.0],
305+
vec![4.0, 5.0, 6.0],
306+
vec![7.0, 8.0, 9.0],
307+
];
308+
309+
let means = compute_means(&data);
310+
assert_eq!(means, vec![4.0, 5.0, 6.0]);
311+
}
312+
313+
#[test]
314+
fn test_power_iteration() {
315+
let matrix = vec![4.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 2.0];
316+
317+
let (eigenvalue, eigenvector) = power_iteration(&matrix, 3, 1000, 1e-10);
318+
319+
assert!(eigenvalue > 0.0);
320+
assert_eq!(eigenvector.len(), 3);
321+
322+
let norm = eigenvector.iter().map(|x| x * x).sum::<f64>().sqrt();
323+
assert!((norm - 1.0).abs() < 1e-6);
324+
}
325+
}

0 commit comments

Comments
 (0)