Skip to content

Commit 831ef6a

Browse files
refactor!: remove MockEmbedder from public API
MockEmbedder was a 64-dim keyword-bag helper exposed in the public API since the prototype era. It was never a real embedder — its presence muddled the public surface (consumers could accidentally pick it as a CLI default, fixture configs could opaquely use it for production), and the 'no mocks unless absolutely necessary' rule never applied to it cleanly because it lived in src/.

Lifted to tests/common/test_embedder.rs as BagOfWordsEmbedder. Visible only to integration tests; never to consumers, never to the CLI, never to the published crate.

BREAKING CHANGES (pre-publication cleanup):
- pub MockEmbedder removed from src/embedding.rs
- CLI --embedder mock flag removed; default is now --embedder fastembed
- EvalSuite no longer accepts embedding_model = "mock" in router.toml
- RouterConfig::default_config now uses "fastembed/AllMiniLML6V2"

Added: EvalSuite::from_dir_with_embedder(path, embedder) for tests that want to inject a fast deterministic embedder explicitly.

The scoring.rs unit test uses a local InlineKwEmbed with identical routing semantics, because the BagOfWordsEmbedder in tests/common/ is not reachable from unit tests inside src/.
1 parent 2a64702 commit 831ef6a

12 files changed

Lines changed: 321 additions & 230 deletions

CHANGELOG.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,20 @@ with a 0.x convention: breaking changes can land on minor bumps until 1.0.0.
88

99
## [Unreleased]
1010

11+
### Removed (BREAKING: pre-publication cleanup)
12+
- `MockEmbedder` removed from public API entirely. It was a 64-dim keyword-bag
13+
helper for testing routing math, not a real embedder, and its presence in the
14+
public surface invited misuse (e.g. accidental CLI default, misleading first
15+
impressions). Lifted to `tests/common/test_embedder.rs` as `BagOfWordsEmbedder`,
16+
visible only to integration tests.
17+
- CLI `--embedder mock` flag removed; default is now `--embedder fastembed`.
18+
- `EvalSuite` no longer accepts the `embedding_model = "mock"` config string, and `RouterConfig::default_config` now uses `"fastembed/AllMiniLML6V2"`.
19+
20+
### Added
21+
- `EvalSuite::from_dir_with_embedder(path, Box<dyn EmbeddingProvider>)` for
22+
tests that need to inject a fast deterministic embedder without the fastembed
23+
model download.
24+
1125
## [0.1.1] - 2026-05-07
1226

1327
First public release on crates.io. Slimmed dep graph (254 → ~21 lean / ~210 default), polished public API.
@@ -44,7 +58,7 @@ First public release on crates.io. Slimmed dep graph (254 → ~21 lean / ~210 de
4458
30-second example, BYO-embedder example, decision JSON, contract testing,
4559
CLI, real performance numbers, status, roadmap.
4660
- `CONTRIBUTING.md` and issue/PR templates.
47-
- `src/time_util.rs` std-only ISO-8601 + compact timestamp formatters,
61+
- `src/time_util.rs`: std-only ISO-8601 + compact timestamp formatters,
4862
replacing chrono. Hinnant's `civil_from_days` algorithm.
4963

5064
### Migration from 0.1.0 (internal preview)

src/config.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,15 @@ impl RouterConfig {
111111
Ok(config)
112112
}
113113

114-
/// Return a sensible default config suitable for testing with `MockEmbedder`.
114+
/// Return a sensible default config. Useful in tests where the caller
115+
/// supplies a custom [`EmbeddingProvider`](crate::embedding::EmbeddingProvider)
116+
/// directly rather than relying on auto-construction from `embedding_model`.
115117
pub fn default_config() -> Self {
116118
RouterConfig {
117119
router: RouterSection {
118120
name: "semrouter".to_string(),
119121
version: "0.1.0".to_string(),
120-
embedding_model: "mock".to_string(),
122+
embedding_model: "fastembed/AllMiniLML6V2".to_string(),
121123
vector_dimension: 384,
122124
similarity: "cosine".to_string(),
123125
top_k: 3,

src/embedding.rs

Lines changed: 0 additions & 176 deletions
Original file line numberDiff line numberDiff line change
@@ -51,144 +51,6 @@ pub trait EmbeddingProvider {
5151
fn dimension(&self) -> usize;
5252
}
5353

54-
// ── MockEmbedder ──────────────────────────────────────────────────────────────
55-
56-
/// Maps keywords to fixed embedding dimensions so different semantic domains
57-
/// occupy non-overlapping dimension ranges, enabling reliable cosine similarity.
58-
const VOCAB: &[(&str, usize)] = &[
59-
// coding (dims 0-15)
60-
("debug", 0),
61-
("code", 1),
62-
("error", 2),
63-
("function", 3),
64-
("test", 4),
65-
("implement", 5),
66-
("fix", 6),
67-
("rust", 7),
68-
("python", 8),
69-
("javascript", 9),
70-
("refactor", 10),
71-
("compile", 11),
72-
("variable", 12),
73-
("class", 13),
74-
("syntax", 14),
75-
("module", 15),
76-
// second brain (dims 16-31)
77-
("save", 16),
78-
("brain", 17),
79-
("note", 18),
80-
("knowledge", 19),
81-
("capture", 20),
82-
("store", 21),
83-
("idea", 22),
84-
("memory", 23),
85-
("archive", 24),
86-
("organize", 25),
87-
("file", 26),
88-
("link", 27),
89-
("thought", 28),
90-
("insight", 29),
91-
("category", 30),
92-
("tag", 31),
93-
// research (dims 32-47)
94-
("research", 32),
95-
("find", 33),
96-
("look", 34),
97-
("search", 35),
98-
("information", 36),
99-
("learn", 37),
100-
("understand", 38),
101-
("explain", 39),
102-
("study", 40),
103-
("read", 41),
104-
("paper", 42),
105-
("article", 43),
106-
("data", 44),
107-
("source", 45),
108-
("evidence", 46),
109-
("review", 47),
110-
// model/task routing (dims 48-63)
111-
("complex", 48),
112-
("simple", 49),
113-
("reasoning", 50),
114-
("quick", 51),
115-
("expensive", 52),
116-
("cheap", 53),
117-
("fast", 54),
118-
("slow", 55),
119-
("creative", 56),
120-
("analytical", 57),
121-
("generate", 58),
122-
("summarize", 59),
123-
("analyze", 60),
124-
("strategy", 61),
125-
("plan", 62),
126-
("decide", 63),
127-
];
128-
129-
const DIM: usize = 64;
130-
131-
/// Deterministic keyword-bag embedder for tests; not suitable for production use.
132-
pub struct MockEmbedder;
133-
134-
impl Default for MockEmbedder {
135-
fn default() -> Self {
136-
MockEmbedder
137-
}
138-
}
139-
140-
impl MockEmbedder {
141-
/// Create a new `MockEmbedder`.
142-
pub fn new() -> Self {
143-
MockEmbedder
144-
}
145-
}
146-
147-
impl EmbeddingProvider for MockEmbedder {
148-
fn embed(&self, text: &str) -> Result<Vec<f32>, RouterError> {
149-
let lower = text.to_lowercase();
150-
let words: Vec<&str> = lower.split_whitespace().collect();
151-
let mut vec = vec![0.0f32; DIM];
152-
153-
for word in &words {
154-
let word = word.trim_matches(|c: char| !c.is_alphanumeric());
155-
for &(keyword, dim) in VOCAB {
156-
if word == keyword || word.starts_with(keyword) {
157-
vec[dim] += 1.0;
158-
}
159-
}
160-
}
161-
162-
// Small per-text noise so identical-keyword texts can still be distinguished
163-
let hash = fnv_hash(text);
164-
for (i, slot) in vec.iter_mut().enumerate() {
165-
let noise = ((hash
166-
.wrapping_add(i as u64)
167-
.wrapping_mul(6364136223846793005))
168-
>> 33) as f32
169-
/ (u32::MAX as f32)
170-
* 0.05;
171-
*slot += noise;
172-
}
173-
174-
normalize(&mut vec);
175-
Ok(vec)
176-
}
177-
178-
fn dimension(&self) -> usize {
179-
DIM
180-
}
181-
}
182-
183-
fn fnv_hash(s: &str) -> u64 {
184-
let mut hash: u64 = 14695981039346656037;
185-
for byte in s.bytes() {
186-
hash = hash.wrapping_mul(1099511628211);
187-
hash ^= byte as u64;
188-
}
189-
hash
190-
}
191-
19254
// ── Math helpers ──────────────────────────────────────────────────────────────
19355

19456
/// Normalize a vector in-place to unit length; no-op if the norm is near zero.
@@ -207,41 +69,3 @@ pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
20769
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
20870
}
20971

210-
#[cfg(test)]
211-
mod tests {
212-
use super::*;
213-
214-
#[test]
215-
fn mock_embedder_is_deterministic() {
216-
let e = MockEmbedder::new();
217-
let v1 = e.embed("Help me debug this Python error").unwrap();
218-
let v2 = e.embed("Help me debug this Python error").unwrap();
219-
assert_eq!(v1, v2);
220-
}
221-
222-
#[test]
223-
fn mock_embedder_produces_normalized_vector() {
224-
let e = MockEmbedder::new();
225-
let v = e.embed("Save this idea to my second brain").unwrap();
226-
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
227-
assert!(
228-
(norm - 1.0).abs() < 1e-5,
229-
"Expected unit vector, got norm={norm}"
230-
);
231-
}
232-
233-
#[test]
234-
fn coding_texts_are_more_similar_to_each_other() {
235-
let e = MockEmbedder::new();
236-
let coding1 = e.embed("Help me debug this Python error").unwrap();
237-
let coding2 = e.embed("Fix this Rust compile error in my code").unwrap();
238-
let brain = e.embed("Save this idea to my second brain").unwrap();
239-
240-
let sim_same = cosine_similarity(&coding1, &coding2);
241-
let sim_diff = cosine_similarity(&coding1, &brain);
242-
assert!(
243-
sim_same > sim_diff,
244-
"same-domain sim={sim_same} should > cross-domain sim={sim_diff}"
245-
);
246-
}
247-
}

src/main.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,14 @@ use std::path::{Path, PathBuf};
55
use clap::{Parser, Subcommand, ValueEnum};
66
use semrouter::{
77
config::RouterConfig,
8-
embedding::{EmbeddingProvider, FastEmbedEmbedder, MockEmbedder},
8+
embedding::{EmbeddingProvider, FastEmbedEmbedder},
99
eval::{load_eval_cases, run_eval, EvalMetrics},
1010
experiment::ExperimentResult,
1111
SemanticRouter,
1212
};
1313

1414
#[derive(Debug, Clone, ValueEnum)]
1515
enum EmbedderType {
16-
Mock,
1716
Fastembed,
1817
}
1918

@@ -38,8 +37,8 @@ struct Cli {
3837
#[arg(long, default_value = "routes.jsonl")]
3938
routes: PathBuf,
4039

41-
/// Embedder backend: mock (keyword-based, no network) or fastembed (local ONNX)
42-
#[arg(long, default_value = "mock", value_enum)]
40+
/// Embedder backend: fastembed (local ONNX, default)
41+
#[arg(long, default_value = "fastembed", value_enum)]
4342
embedder: EmbedderType,
4443

4544
#[command(subcommand)]
@@ -83,7 +82,6 @@ enum Commands {
8382

8483
fn build_embedder(embedder_type: &EmbedderType) -> Result<Box<dyn EmbeddingProvider>, String> {
8584
match embedder_type {
86-
EmbedderType::Mock => Ok(Box::new(MockEmbedder::new())),
8785
EmbedderType::Fastembed => FastEmbedEmbedder::new()
8886
.map(|e| Box::new(e) as Box<dyn EmbeddingProvider>)
8987
.map_err(|e| format!("Failed to create fastembed embedder: {e}")),
@@ -92,7 +90,6 @@ fn build_embedder(embedder_type: &EmbedderType) -> Result<Box<dyn EmbeddingProvi
9290

9391
fn embedder_label(t: &EmbedderType) -> &'static str {
9492
match t {
95-
EmbedderType::Mock => "mock",
9693
EmbedderType::Fastembed => "fastembed",
9794
}
9895
}

src/scoring.rs

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,40 @@ pub fn score_routes(
8181
#[cfg(test)]
8282
mod tests {
8383
use super::*;
84-
use crate::embedding::{EmbeddingProvider, MockEmbedder};
84+
use crate::embedding::{normalize, EmbeddingProvider};
85+
use crate::error::RouterError;
8586
use crate::route::{EmbeddedExample, RiskLevel, RouteExample};
8687

88+
// Minimal inline keyword-bag embedder for this unit test only.
89+
// Keeps scoring.rs free of a dependency on the test-only BagOfWordsEmbedder
90+
// in tests/common/ (which is not reachable from src/ unit tests).
91+
struct InlineKwEmbed;
92+
impl EmbeddingProvider for InlineKwEmbed {
93+
fn embed(&self, text: &str) -> Result<Vec<f32>, RouterError> {
94+
const KW: &[(&str, usize)] = &[
95+
("debug", 0), ("code", 1), ("error", 2), ("fix", 6), ("rust", 7),
96+
("python", 8), ("compile", 11), ("test", 4),
97+
("save", 16), ("brain", 17), ("note", 18), ("knowledge", 19),
98+
("capture", 20), ("store", 21), ("idea", 22), ("insight", 29),
99+
];
100+
let lower = text.to_lowercase();
101+
let mut v = vec![0.0f32; 64];
102+
for word in lower.split_whitespace() {
103+
let word = word.trim_matches(|c: char| !c.is_alphanumeric());
104+
for &(kw, dim) in KW {
105+
if word == kw || word.starts_with(kw) {
106+
v[dim] += 1.0;
107+
}
108+
}
109+
}
110+
normalize(&mut v);
111+
Ok(v)
112+
}
113+
fn dimension(&self) -> usize { 64 }
114+
}
115+
87116
fn make_example(id: &str, route: &str, text: &str) -> EmbeddedExample {
88-
let e = MockEmbedder::new();
89-
let embedding = e.embed(text).unwrap();
117+
let embedding = InlineKwEmbed.embed(text).unwrap();
90118
EmbeddedExample {
91119
example: RouteExample {
92120
id: id.to_string(),
@@ -122,8 +150,7 @@ mod tests {
122150
),
123151
];
124152

125-
let embedder = MockEmbedder::new();
126-
let input = embedder.embed("debug this code error in python").unwrap();
153+
let input = InlineKwEmbed.embed("debug this code error in python").unwrap();
127154
let candidates = score_routes(&input, &examples, 3, &[], 0.0);
128155

129156
assert!(!candidates.is_empty());

0 commit comments

Comments
 (0)