diff --git a/Cargo.lock b/Cargo.lock
index c337c95..564a8b7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -424,7 +424,10 @@ dependencies = [
  "codspeed-divan-compat",
  "image",
  "image-compare",
+ "jetscii",
+ "memchr",
  "rand",
+ "rayon",
 ]
 
 [[package]]
@@ -654,6 +657,12 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
 
+[[package]]
+name = "jetscii"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e"
+
 [[package]]
 name = "jobserver"
 version = "0.1.34"
diff --git a/Cargo.toml b/Cargo.toml
index f4172be..705da70 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,9 @@ path = "src/lib.rs"
 rand = "0.8"
 image = "0.25"
 image-compare = "0.5.0"
+rayon = "1.11.0"
+jetscii = "0.5.3"
+memchr = "2.7.6"
 
 [dev-dependencies]
 divan = { version = "4.0.2", package = "codspeed-divan-compat" }
diff --git a/README.md b/README.md
index 8e9c5be..9e4aa1e 100644
--- a/README.md
+++ b/README.md
@@ -34,3 +34,5 @@ cargo codspeed run -m walltime
 ```
 
 Note: You can also set the `CODSPEED_RUNNER_MODE` environment variable to `walltime` to avoid passing `-m walltime` every time.
+
+
diff --git a/src/bfs.rs b/src/bfs.rs
index 487fddc..3f9e173 100644
--- a/src/bfs.rs
+++ b/src/bfs.rs
@@ -1,4 +1,5 @@
-use std::collections::HashSet;
+use core::hash::{BuildHasherDefault, Hasher};
+use std::collections::{HashSet, VecDeque};
 
 /// A simple graph represented as an adjacency list
 #[derive(Debug, Clone)]
@@ -23,25 +24,76 @@ impl Graph {
     }
 }
 
+/// `BuildHasher` that hands out a default-constructed [`NoHashHasher`] for each hash computation.
+pub(crate) type BuildNoHashHasher = BuildHasherDefault<NoHashHasher>;
+
+/// Pass-through hasher for `usize` keys: the "hash" is the key value itself.
+///
+/// Only `write_usize` is supported; every other `write*` method panics, so a key
+/// type that feeds the hasher through any other method fails loudly.
+#[derive(Default)]
+pub(crate) struct NoHashHasher(u64);
+
+impl Hasher for NoHashHasher {
+    // Returns the value stored by the most recent `write_usize` call (0 if there was none).
+    fn finish(&self) -> u64 {
+        self.0
+    }
+    // Byte-slice and all non-`usize` integer inputs are rejected with a panic.
+    fn write(&mut self, _: &[u8]) {
+        unreachable!("Should not be used")
+    }
+    fn write_u8(&mut self, _: u8) {
+        unreachable!("Should not be used")
+    }
+    fn write_u16(&mut self, _: u16) {
+        unreachable!("Should not be used")
+    }
+    fn write_u32(&mut self, _: u32) {
+        unreachable!("Should not be used")
+    }
+    fn write_u64(&mut self, _: u64) {
+        unreachable!("Should not be used")
+    }
+    // The only supported input: keep the key unchanged as the hash value.
+    fn write_usize(&mut self, n: usize) {
+        self.0 = n as u64;
+    }
+    fn write_i8(&mut self, _: i8) {
+        unreachable!("Should not be used")
+    }
+    fn write_i16(&mut self, _: i16) {
+        unreachable!("Should not be used")
+    }
+    fn write_i32(&mut self, _: i32) {
+        unreachable!("Should not be used")
+    }
+    fn write_i64(&mut self, _: i64) {
+        unreachable!("Should not be used")
+    }
+    fn write_isize(&mut self, _: isize) {
+        unreachable!("Should not be used")
+    }
+}
+
-/// Naive BFS implementation using Vec as a queue (intentionally slow)
+/// BFS implementation using a `VecDeque` queue and a pass-through-hashed visited set
 /// Returns the order in which nodes were visited
 pub fn bfs_naive(graph: &Graph, start: usize) -> Vec<usize> {
-    let mut visited = HashSet::new();
-    let mut queue = Vec::new(); // Using Vec instead of VecDeque - intentionally inefficient!
+    // Node ids act as their own hash; the set starts with room for 1024 entries.
+    let mut visited = HashSet::with_capacity_and_hasher(1024, BuildNoHashHasher::new());
+    let mut queue = VecDeque::new(); // VecDeque: popping the front is O(1), unlike Vec::remove(0)
     let mut result = Vec::new();
 
-    queue.push(start);
+    queue.push_back(start);
     visited.insert(start);
 
-    while !queue.is_empty() {
-        // remove(0) is O(n) - this makes BFS slow!
-        let node = queue.remove(0);
+    while let Some(node) = queue.pop_front() {
         result.push(node);
 
         if let Some(neighbors) = graph.adjacency.get(node) {
             for &neighbor in neighbors {
                 if visited.insert(neighbor) {
-                    queue.push(neighbor);
+                    queue.push_back(neighbor);
                 }
             }
         }
diff --git a/src/dna_matcher.rs b/src/dna_matcher.rs
index d99c90e..032b3dc 100644
--- a/src/dna_matcher.rs
+++ b/src/dna_matcher.rs
@@ -1,10 +1,21 @@
+use jetscii::ByteSubstring;
+use rayon::prelude::*;
+
-/// Naive approach: Read the entire file as a string and filter lines
-pub fn naive_dna_matcher(genome: &str, pattern: &str) -> Vec<String> {
+/// Returns every non-header line of `genome` that contains `pattern`.
+///
+/// Lines whose first character is `>` are headers and are skipped. Lines are
+/// produced by rayon's `par_lines`, which follows `str::lines`: a trailing `\r`
+/// is stripped and a final newline does not yield an extra empty line.
+///
+/// The returned slices borrow from `genome`; nothing is copied.
+pub fn naive_dna_matcher<'a>(genome: &'a str, pattern: &str) -> Vec<&'a str> {
+    // Build the substring searcher once; every line reuses it.
+    let searcher = ByteSubstring::new(pattern.as_bytes());
     genome
-        .lines()
+        // `par_lines` hands out `&str` directly, so no `unsafe` UTF-8 rebuild is needed.
+        .par_lines()
         .filter(|line| !line.starts_with('>')) // Skip headers
-        .filter(|line| line.contains(pattern))
-        .map(|s| s.to_string())
+        .filter(|line| searcher.find(line.as_bytes()).is_some())
         .collect()
 }
 
diff --git a/src/lut_filters.rs b/src/lut_filters.rs
index a73068c..24bef78 100644
--- a/src/lut_filters.rs
+++ b/src/lut_filters.rs
@@ -38,48 +38,71 @@ pub fn apply_brightness_contrast_gamma(
 mod naive {
     use super::*;
 
+    /// Compile-time table: `LUT1[i]` is `i as f32 - 128.0` for every 8-bit channel value `i`.
+    const LUT1: [f32; 256] = {
+        let mut data = [0.0; 256];
+        let mut i = 0;
+        while i < 256 {
+            data[i] = i as f32 - 128.0;
+            i += 1;
+        }
+        data
+    };
+
-    /// Apply brightness and contrast with floating-point math per pixel
+    /// Apply brightness and contrast through a 256-entry lookup table built once per call
     pub fn apply_brightness_contrast(img: &RgbImage, brightness: i16, contrast: f32) -> RgbImage {
         let (width, height) = img.dimensions();
         let mut output = ImageBuffer::new(width, height);
 
-        for (x, y, pixel) in img.enumerate_pixels() {
-            let r = pixel[0] as f32;
-            let g = pixel[1] as f32;
-            let b = pixel[2] as f32;
+        // Entry `i` is `LUT1[i] * (1.0 + contrast) + 128.0 + brightness`, clamped to 0..=255.
+        let mut lut: [u8; 256] = [0; 256];
+        for i in 0..256 {
+            lut[i] =
+                ((LUT1[i] * (1.0 + contrast)) + 128.0 + brightness as f32).clamp(0.0, 255.0) as u8
+        }
 
+        for (x, y, pixel) in img.enumerate_pixels() {
-            // Apply contrast and brightness (5 FP ops per channel!)
+            // Contrast and brightness are already baked into `lut`; just look each channel up
-            let r = ((r - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32;
-            let g = ((g - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32;
-            let b = ((b - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32;
-
-            output.put_pixel(
-                x,
-                y,
-                Rgb([
-                    r.clamp(0.0, 255.0) as u8,
-                    g.clamp(0.0, 255.0) as u8,
-                    b.clamp(0.0, 255.0) as u8,
-                ]),
-            );
+            let r = lut[pixel[0] as usize];
+            let g = lut[pixel[1] as usize];
+            let b = lut[pixel[2] as usize];
+
+            output.put_pixel(x, y, Rgb([r, g, b]));
         }
 
         output
     }
 
+    /// Compile-time table: `LUT2[i]` is `i as f32 / 255.0` for every 8-bit channel value `i`.
+    const LUT2: [f32; 256] = {
+        let mut data = [0.0; 256];
+        let mut i = 0;
+        while i < 256 {
+            data[i] = i as f32 / 255.0;
+            i += 1;
+        }
+        data
+    };
+
     /// Naive implementation: Apply gamma correction
-    /// This is VERY slow because powf() is expensive!
+    /// `powf()` is only evaluated while building the 256-entry table, not per pixel
     pub fn apply_gamma(img: &RgbImage, gamma: f32) -> RgbImage {
         let (width, height) = img.dimensions();
         let mut output = ImageBuffer::new(width, height);
 
+        // Entry `i` is `LUT2[i].powf(1.0 / gamma) * 255.0` cast to `u8`.
+        let mut lut: [u8; 256] = [0; 256];
+        for i in 0..256 {
+            lut[i] = (LUT2[i].powf(1.0 / gamma) * 255.0) as u8;
+        }
+
         for (x, y, pixel) in img.enumerate_pixels() {
             // powf() is VERY expensive - this is why we need a LUT!
-            let r = (pixel[0] as f32 / 255.0).powf(1.0 / gamma) * 255.0;
-            let g = (pixel[1] as f32 / 255.0).powf(1.0 / gamma) * 255.0;
-            let b = (pixel[2] as f32 / 255.0).powf(1.0 / gamma) * 255.0;
+            let r = lut[pixel[0] as usize];
+            let g = lut[pixel[1] as usize];
+            let b = lut[pixel[2] as usize];
 
-            output.put_pixel(x, y, Rgb([r as u8, g as u8, b as u8]));
+            output.put_pixel(x, y, Rgb([r, g, b]));
         }
 
         output