diff --git a/Cargo.lock b/Cargo.lock index c337c95..7054777 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -424,7 +424,10 @@ dependencies = [ "codspeed-divan-compat", "image", "image-compare", + "memchr", + "memmap2", "rand", + "rayon", ] [[package]] @@ -749,6 +752,15 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +[[package]] +name = "memmap2" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" +dependencies = [ + "libc", +] + [[package]] name = "minimal-lexical" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index f4172be..8ff3f4b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,9 @@ path = "src/lib.rs" rand = "0.8" image = "0.25" image-compare = "0.5.0" +memmap2 = "0.9" +rayon = "1.10" +memchr = "2.7" [dev-dependencies] divan = { version = "4.0.2", package = "codspeed-divan-compat" } diff --git a/benches/blob_corruption_checker.rs b/benches/blob_corruption_checker.rs index 3ad54d6..9e74386 100644 --- a/benches/blob_corruption_checker.rs +++ b/benches/blob_corruption_checker.rs @@ -1,5 +1,5 @@ use divan::Bencher; -use eurorust_2025_workshop::blob_corruption_checker::find_corruptions_sequential; +use eurorust_2025_workshop::blob_corruption_checker::find_corruptions_parallel; fn main() { divan::main(); @@ -8,7 +8,7 @@ fn main() { #[divan::bench(sample_count = 3, sample_size = 5)] fn corruption_check(bencher: Bencher) { bencher.bench_local(|| { - let corruptions = divan::black_box(find_corruptions_sequential( + let corruptions = divan::black_box(find_corruptions_parallel( "reference.bin", "corrupted.bin", 1024, // 1KB chunks @@ -18,14 +18,25 @@ fn corruption_check(bencher: Bencher) { // All corruptions should be 1KB aligned for corruption in &corruptions { - assert_eq!(corruption.offset % 1024, 0, "Corruption offset should be 1KB 
aligned"); - assert_eq!(corruption.length % 1024, 0, "Corruption length should be multiple of 1KB"); + assert_eq!( + corruption.offset % 1024, + 0, + "Corruption offset should be 1KB aligned" + ); + assert_eq!( + corruption.length % 1024, + 0, + "Corruption length should be multiple of 1KB" + ); } // Check specific corruptions assert_eq!(corruptions[0].offset, 14801920, "First corruption offset"); assert_eq!(corruptions[0].length, 2048, "First corruption length"); - assert_eq!(corruptions[25].offset, 243891200, "Middle corruption offset"); + assert_eq!( + corruptions[25].offset, 243891200, + "Middle corruption offset" + ); assert_eq!(corruptions[25].length, 4096, "Middle corruption length"); assert_eq!(corruptions[49].offset, 507871232, "Last corruption offset"); assert_eq!(corruptions[49].length, 5120, "Last corruption length"); diff --git a/benches/dna_matcher.rs b/benches/dna_matcher.rs index c955168..6011a37 100644 --- a/benches/dna_matcher.rs +++ b/benches/dna_matcher.rs @@ -11,7 +11,7 @@ fn dna_matcher() { ); let pattern = "AGTCCGTA"; - let matches = divan::black_box(naive_dna_matcher( + let matches = divan::black_box(exported_dna_matcher( divan::black_box(&genome), divan::black_box(pattern), )); diff --git a/benches/lut_grayscale_bench.rs b/benches/lut_grayscale_bench.rs index 5816569..64add8b 100644 --- a/benches/lut_grayscale_bench.rs +++ b/benches/lut_grayscale_bench.rs @@ -1,5 +1,5 @@ use eurorust_2025_workshop::lut_grayscale::*; -use image::{RgbImage}; +use image::RgbImage; fn main() { divan::main(); diff --git a/src/bfs.rs b/src/bfs.rs index 487fddc..938102d 100644 --- a/src/bfs.rs +++ b/src/bfs.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::VecDeque; /// A simple graph represented as an adjacency list #[derive(Debug, Clone)] @@ -26,22 +26,23 @@ impl Graph { /// Naive BFS implementation using Vec as a queue (intentionally slow) /// Returns the order in which nodes were visited pub fn bfs_naive(graph: &Graph, start: usize) -> 
Vec<usize> { - let mut visited = HashSet::new(); - let mut queue = Vec::new(); // Using Vec instead of VecDeque - intentionally inefficient! - let mut result = Vec::new(); + let mut visited = vec![false; graph.num_nodes()]; + let mut queue = VecDeque::new(); // Using VecDeque for efficient FIFO queue + let mut result = Vec::with_capacity(graph.num_nodes()); - queue.push(start); - visited.insert(start); + queue.push_back(start); + visited[start] = true; while !queue.is_empty() { - // remove(0) is O(n) - this makes BFS slow! - let node = queue.remove(0); + // pop_front() is O(1) - this makes BFS efficient! + let node = queue.pop_front().unwrap(); result.push(node); if let Some(neighbors) = graph.adjacency.get(node) { for &neighbor in neighbors { - if visited.insert(neighbor) { - queue.push(neighbor); + if !visited[neighbor] { + visited[neighbor] = true; + queue.push_back(neighbor); } } } diff --git a/src/blob_corruption_checker.rs b/src/blob_corruption_checker.rs index 2515c20..863e750 100644 --- a/src/blob_corruption_checker.rs +++ b/src/blob_corruption_checker.rs @@ -1,3 +1,5 @@ +use memmap2::Mmap; +use rayon::prelude::*; use std::fs::File; use std::io::{BufReader, Read}; @@ -60,6 +62,85 @@ pub fn find_corruptions_sequential( corruptions } +pub fn find_corruptions_parallel( + reference_path: &str, + corrupted_path: &str, + chunk_size: usize, +) -> Vec<Corruption> { + // Memory map both files + let ref_file = File::open(reference_path).unwrap(); + let corrupt_file = File::open(corrupted_path).unwrap(); + + // it is fine to use unsafe here since the files are not modified while mapped + let ref_mmap = unsafe { Mmap::map(&ref_file).unwrap() }; + let corrupt_mmap = unsafe { Mmap::map(&corrupt_file).unwrap() }; + + let file_size = ref_mmap.len(); + + // Divide the file into chunks and process in parallel + let num_chunks = (file_size + chunk_size - 1) / chunk_size; + + // Use fold/reduce pattern to stream and merge results + let corruptions = (0..num_chunks) + .into_par_iter() + .fold( + 
Vec::new, + |mut acc: Vec<Corruption>, chunk_idx| { + let offset = chunk_idx * chunk_size; + let end = std::cmp::min(offset + chunk_size, file_size); + let len = end - offset; + + let ref_chunk = &ref_mmap[offset..end]; + let corrupt_chunk = &corrupt_mmap[offset..end]; + + if ref_chunk != corrupt_chunk { + let corruption = Corruption { + offset: offset as u64, + length: len as u64, + }; + + // Try to merge with the last corruption in this thread's accumulator + if let Some(last) = acc.last_mut() { + if last.offset + last.length == corruption.offset { + last.length += corruption.length; + } else { + acc.push(corruption); + } + } else { + acc.push(corruption); + } + } + + acc + }, + ) + .reduce(Vec::new, |mut a, b| { + // Merge two vectors of corruptions + if a.is_empty() { + return b; + } + if b.is_empty() { + return a; + } + + // Check if we can merge the last of 'a' with the first of 'b' + let last_a = a.last_mut().unwrap(); + let mut b_iter = b.into_iter(); + let first_b = b_iter.next().unwrap(); + + if last_a.offset + last_a.length == first_b.offset { + last_a.length += first_b.length; + } else { + a.push(first_b); + } + + a.extend(b_iter); + a + }); + + corruptions +} + #[cfg(test)] mod tests { use super::*; @@ -92,10 +173,39 @@ mod tests { "Middle corruption offset" ); assert_eq!(corruptions[25].length, 4096, "Middle corruption length"); + assert_eq!(corruptions[49].offset, 507871232, "Last corruption offset"); + assert_eq!(corruptions[49].length, 5120, "Last corruption length"); + } + + #[test] + fn test_find_corruptions_parallel() { + let corruptions = find_corruptions_parallel("reference.bin", "corrupted.bin", 1024); + + assert_eq!(corruptions.len(), 50, "Should find 50 corruptions"); + + // All corruptions should be 1KB aligned + for corruption in &corruptions { + assert_eq!( + corruption.offset % 1024, + 0, + "Corruption offset should be 1KB aligned" + ); + assert_eq!( + corruption.length % 1024, + 0, + "Corruption length should be multiple of 1KB" + ); + } + + // 
Check specific corruptions + assert_eq!(corruptions[0].offset, 14801920, "First corruption offset"); + assert_eq!(corruptions[0].length, 2048, "First corruption length"); assert_eq!( - corruptions[49].offset, 507871232, - "Last corruption offset" + corruptions[25].offset, 243891200, + "Middle corruption offset" ); + assert_eq!(corruptions[25].length, 4096, "Middle corruption length"); + assert_eq!(corruptions[49].offset, 507871232, "Last corruption offset"); assert_eq!(corruptions[49].length, 5120, "Last corruption length"); } } diff --git a/src/dna_matcher.rs b/src/dna_matcher.rs index d99c90e..582bd62 100644 --- a/src/dna_matcher.rs +++ b/src/dna_matcher.rs @@ -1,13 +1,97 @@ +use rayon::prelude::*; +use memchr::memmem; + +pub fn exported_dna_matcher(genome: &str, pattern: &str) -> Vec<String> { + chunked_dna_matcher(genome, pattern) +} + /// Naive approach: Read the entire file as a string and filter lines -pub fn naive_dna_matcher(genome: &str, pattern: &str) -> Vec<String> { +fn naive_dna_matcher(genome: &str, pattern: &str) -> Vec<String> { genome - .lines() + .par_lines() .filter(|line| !line.starts_with('>')) // Skip headers .filter(|line| line.contains(pattern)) .map(|s| s.to_string()) .collect() } +/// Chunked approach: Process genome in parallel byte chunks +fn chunked_dna_matcher(genome: &str, pattern: &str) -> Vec<String> { + let pattern_bytes = pattern.as_bytes(); + let genome_bytes = genome.as_bytes(); + let finder = memmem::Finder::new(pattern_bytes); + + // Chunk size: balance between parallelism and overhead + // Aim for ~1000 lines per chunk, with typical DNA line length of 60-80 chars + let chunk_size = 64 * 1024; // 64KB per chunk + let total_len = genome_bytes.len(); + + // Find chunk boundaries that align with line boundaries + let mut chunk_starts = vec![0]; + let mut pos = chunk_size; + + while pos < total_len { + // Find the next newline after pos + let search_start = pos; + let search_end = std::cmp::min(pos + 1024, total_len); // Look ahead up to 1KB for newline + + if 
let Some(newline_offset) = memchr::memchr(b'\n', &genome_bytes[search_start..search_end]) { + chunk_starts.push(search_start + newline_offset + 1); + pos = search_start + newline_offset + 1 + chunk_size; + } else { + // No newline within the lookahead window: stop splitting here so the + // final push(total_len) closes the last chunk without cutting a line in half + break; + } + } + chunk_starts.push(total_len); + + // Process chunks in parallel + let matches: Vec<String> = (0..chunk_starts.len() - 1) + .into_par_iter() + .flat_map(|i| { + let chunk_start = chunk_starts[i]; + let chunk_end = chunk_starts[i + 1]; + let chunk = &genome_bytes[chunk_start..chunk_end]; + + let mut local_matches = Vec::new(); + let mut line_start = 0; + + // Use memchr_iter for faster newline finding + for newline_pos in memchr::memchr_iter(b'\n', chunk) { + let line = &chunk[line_start..newline_pos]; + line_start = newline_pos + 1; + + // Skip headers and empty lines + if !line.is_empty() && line[0] != b'>' { + // Use memmem for fast substring search + if finder.find(line).is_some() { + // SAFETY: DNA sequences are ASCII-only, so we can skip UTF-8 validation + let line_str = unsafe { std::str::from_utf8_unchecked(line) }; + local_matches.push(line_str.to_string()); + } + } + } + + // Handle last line if chunk doesn't end with newline + if line_start < chunk.len() { + let line = &chunk[line_start..]; + if !line.is_empty() && line[0] != b'>' { + if finder.find(line).is_some() { + // SAFETY: DNA sequences are ASCII-only, so we can skip UTF-8 validation + let line_str = unsafe { std::str::from_utf8_unchecked(line) }; + local_matches.push(line_str.to_string()); + } + } + } + + local_matches + }) + .collect(); + + matches +} + #[cfg(test)] mod tests { use super::*; @@ -16,7 +100,7 @@ fn test_naive_matcher() { let test_genome = ">seq1\nACGTACGT\n>seq2\nAGTCCGTAAA\n>seq3\nGGGGGG"; let pattern = "AGTCCGTA"; - let matches = naive_dna_matcher(test_genome, pattern); + let matches = exported_dna_matcher(test_genome, pattern); assert_eq!(matches.len(), 1); 
assert_eq!(matches[0], "AGTCCGTAAA"); } @@ -28,7 +112,7 @@ mod tests { .expect("Failed to read genome.fasta\n\n Make sure to run 'cargo run --release --bin generate_fasta'"); let pattern = "AGTCCGTA"; - let matches = naive_dna_matcher(&genome, pattern); + let matches = exported_dna_matcher(&genome, pattern); // With fixed seed (42), we should always get exactly 4927 matches assert_eq!( diff --git a/src/lut_filters.rs b/src/lut_filters.rs index a73068c..974a12e 100644 --- a/src/lut_filters.rs +++ b/src/lut_filters.rs @@ -31,8 +31,29 @@ pub fn apply_brightness_contrast_gamma( contrast: f32, gamma: f32, ) -> RgbImage { - let temp_img = apply_brightness_contrast(img, brightness, contrast); - naive::apply_gamma(&temp_img, gamma) + let (width, height) = img.dimensions(); + let mut output = ImageBuffer::new(width, height); + + // precompute two lookup tables at once + let mut brightness_table = [0u8; 256]; // pixels are u8 (0-255) + let mut gamma_table = [0u8; 256]; // pixels are u8 (0-255) + + for i in 0..256 { + brightness_table[i] = (((i as f32 - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32) + .clamp(0.0, 255.0) as u8; + gamma_table[i] = ((i as f32 / 255.0).powf(1.0 / gamma) * 255.0).clamp(0.0, 255.0) as u8; + } + + // apply first the brightness/contrast, then gamma + for (x, y, pixel) in img.enumerate_pixels() { + let r = gamma_table[brightness_table[pixel[0] as usize] as usize] as u8; + let g = gamma_table[brightness_table[pixel[1] as usize] as usize] as u8; + let b = gamma_table[brightness_table[pixel[2] as usize] as usize] as u8; + + output.put_pixel(x, y, Rgb([r, g, b])); + } + + output } mod naive { @@ -43,23 +64,26 @@ mod naive { let (width, height) = img.dimensions(); let mut output = ImageBuffer::new(width, height); - for (x, y, pixel) in img.enumerate_pixels() { - let r = pixel[0] as f32; - let g = pixel[1] as f32; - let b = pixel[2] as f32; + let mut brightness_table = [0u8; 256]; // pixels are u8 (0-255) + + // Precompute brightness table + for i 
in 0..256 { + let adjusted = ((i as f32 - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32; + brightness_table[i] = adjusted.clamp(0.0, 255.0) as u8; + } - // Apply contrast and brightness (5 FP ops per channel!) - let r = ((r - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32; - let g = ((g - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32; - let b = ((b - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32; + for (x, y, pixel) in img.enumerate_pixels() { + let r = pixel[0]; + let g = pixel[1]; + let b = pixel[2]; output.put_pixel( x, y, Rgb([ - r.clamp(0.0, 255.0) as u8, - g.clamp(0.0, 255.0) as u8, - b.clamp(0.0, 255.0) as u8, + brightness_table[r as usize], + brightness_table[g as usize], + brightness_table[b as usize], ]), ); } @@ -67,17 +91,20 @@ mod naive { output } - /// Naive implementation: Apply gamma correction - /// This is VERY slow because powf() is expensive! pub fn apply_gamma(img: &RgbImage, gamma: f32) -> RgbImage { let (width, height) = img.dimensions(); let mut output = ImageBuffer::new(width, height); + let mut gamma_table = [0u8; 256]; // pixels are u8 (0-255) + // Precompute gamma table + for i in 0..256 { + gamma_table[i] = ((i as f32 / 255.0).powf(1.0 / gamma) * 255.0).clamp(0.0, 255.0) as u8; + } + for (x, y, pixel) in img.enumerate_pixels() { - // powf() is VERY expensive - this is why we need a LUT! - let r = (pixel[0] as f32 / 255.0).powf(1.0 / gamma) * 255.0; - let g = (pixel[1] as f32 / 255.0).powf(1.0 / gamma) * 255.0; - let b = (pixel[2] as f32 / 255.0).powf(1.0 / gamma) * 255.0; + let r = gamma_table[pixel[0] as usize]; + let g = gamma_table[pixel[1] as usize]; + let b = gamma_table[pixel[2] as usize]; output.put_pixel(x, y, Rgb([r as u8, g as u8, b as u8])); }