Skip to content

Commit d5b7a81

Browse files
perf(tolerance): add tiered matching with early exit optimization
- Add TieredMatchResult struct with tier and early_exit tracking
- Implement match_tiered() for progressive tolerance application
- Early exit when exactly 1 match found at any tier
- Pre-process content lines once per tier (avoid recomputation)

Performance improvements:
- Tier 0: Exact match → immediate return (no tolerance overhead)
- Tier 1-5: Progressive tolerance with early exit
- Avoids processing all 5 tolerance levels when not needed

8 new tests covering:
- Early exit at each tier (exact, rstrip, lstrip, linenums, unicode)
- Multiple matches (ambiguous case)
- No match scenario
- Performance with large content (1000 lines)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 22a9c70 commit d5b7a81

1 file changed

Lines changed: 367 additions & 0 deletions

File tree

src/utils/tolerance.rs

Lines changed: 367 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,15 @@
33
//! This module implements a 5-level tolerance system for matching search blocks
44
//! in file content, inspired by the wcgw Python implementation.
55
//!
6+
//! ## Optimization: Tiered Matching with Early Exit (v0.2.2)
7+
//!
8+
//! The matching algorithm uses a tiered approach with early exit:
9+
//! - **Tier 0**: Exact match (fastest, no tolerance)
10+
//! - **Tier 1-5**: Progressive tolerance application with early exit
11+
//!
12+
//! When a unique match is found at any tier, processing stops immediately.
13+
//! This dramatically reduces processing time for well-formatted code.
14+
//!
615
//! Tolerance levels (applied in order):
716
//! 1. `rstrip` - Remove trailing whitespace (SILENT)
817
//! 2. `lstrip` - Remove leading indentation (WARNING, score 10x)
@@ -301,6 +310,248 @@ pub fn match_exact(content_lines: &[&str], content_offset: usize, search_lines:
301310
find_contiguous_matches(&search_line_positions)
302311
}
303312

313+
// ============================================================================
314+
// Tiered Matching with Early Exit (Optimized)
315+
// ============================================================================
316+
317+
/// Result of tiered matching - includes which tier found the match
318+
#[derive(Debug, Clone)]
319+
pub struct TieredMatchResult {
320+
/// The match result
321+
pub result: MatchResult,
322+
/// Which tier (0-5) found the match (0 = exact, 1-5 = tolerance levels)
323+
pub tier: usize,
324+
/// Whether early exit was triggered
325+
pub early_exit: bool,
326+
}
327+
328+
/// Tiered matching with early exit optimization
329+
///
330+
/// This is the PREFERRED matching function for performance.
331+
/// It applies tolerances progressively and exits as soon as a unique match is found.
332+
///
333+
/// # Algorithm
334+
/// 1. Try exact match first (Tier 0)
335+
/// 2. If no unique match, apply tolerance level 1 (rstrip)
336+
/// 3. If no unique match, apply tolerance level 2 (lstrip)
337+
/// 4. Continue until unique match found OR all tiers exhausted
338+
///
339+
/// # Early Exit Conditions
340+
/// - Exactly 1 match found at current tier → return immediately
341+
/// - 0 matches at all tiers → return empty
342+
/// - Multiple matches at final tier → return all (caller handles ambiguity)
343+
pub fn match_tiered(
344+
content_lines: &[&str],
345+
content_offset: usize,
346+
search_lines: &[&str],
347+
) -> Vec<TieredMatchResult> {
348+
let n_search = search_lines.len();
349+
let n_content = content_lines.len();
350+
351+
if n_search == 0 || n_content == 0 || n_search > n_content - content_offset {
352+
return vec![];
353+
}
354+
355+
// Tier 0: Exact match (fastest path)
356+
let exact_matches = match_exact(content_lines, content_offset, search_lines);
357+
if exact_matches.len() == 1 {
358+
let (start, end) = exact_matches[0];
359+
debug!("Tiered match: early exit at Tier 0 (exact match)");
360+
return vec![TieredMatchResult {
361+
result: MatchResult {
362+
matched_slice: (start, end),
363+
line_range: (start + 1, end),
364+
tolerances_hit: vec![],
365+
score: 0.0,
366+
warnings: HashSet::new(),
367+
matched_lines: content_lines[start..end]
368+
.iter()
369+
.map(|s| (*s).to_string())
370+
.collect(),
371+
removed_indentation: None,
372+
},
373+
tier: 0,
374+
early_exit: true,
375+
}];
376+
}
377+
378+
// If multiple exact matches, return all (ambiguous)
379+
if exact_matches.len() > 1 {
380+
debug!("Tiered match: {} exact matches (ambiguous)", exact_matches.len());
381+
return exact_matches
382+
.into_iter()
383+
.map(|(start, end)| TieredMatchResult {
384+
result: MatchResult {
385+
matched_slice: (start, end),
386+
line_range: (start + 1, end),
387+
tolerances_hit: vec![],
388+
score: 0.0,
389+
warnings: HashSet::new(),
390+
matched_lines: content_lines[start..end]
391+
.iter()
392+
.map(|s| (*s).to_string())
393+
.collect(),
394+
removed_indentation: None,
395+
},
396+
tier: 0,
397+
early_exit: false,
398+
})
399+
.collect();
400+
}
401+
402+
// Tier 1-5: Apply tolerances progressively with early exit
403+
let tolerances = default_tolerances();
404+
405+
// Pre-process content lines once (avoid repeated allocations)
406+
let content_processed: Vec<Vec<String>> = tolerances
407+
.iter()
408+
.map(|tol| {
409+
content_lines
410+
.iter()
411+
.skip(content_offset)
412+
.map(|line| (tol.line_process)(line))
413+
.collect()
414+
})
415+
.collect();
416+
417+
// Try each tolerance tier
418+
for (tier_idx, tolerance) in tolerances.iter().enumerate() {
419+
let tier = tier_idx + 1; // Tier 1-5 (0 was exact)
420+
421+
// Process search lines with this tolerance
422+
let search_processed: Vec<String> = search_lines
423+
.iter()
424+
.map(|line| (tolerance.line_process)(line))
425+
.collect();
426+
427+
// Build position map for this tier
428+
let mut content_positions: HashMap<&str, HashSet<usize>> = HashMap::new();
429+
for (i, processed_line) in content_processed[tier_idx].iter().enumerate() {
430+
content_positions
431+
.entry(processed_line.as_str())
432+
.or_default()
433+
.insert(i + content_offset);
434+
}
435+
436+
// Get positions for each search line
437+
let search_line_positions: Vec<HashSet<usize>> = search_processed
438+
.iter()
439+
.map(|line| {
440+
content_positions
441+
.get(line.as_str())
442+
.cloned()
443+
.unwrap_or_default()
444+
})
445+
.collect();
446+
447+
// Find contiguous matches at this tier
448+
let matched_slices = find_contiguous_matches(&search_line_positions);
449+
450+
// Early exit: exactly 1 match found
451+
if matched_slices.len() == 1 {
452+
let (start, end) = matched_slices[0];
453+
debug!("Tiered match: early exit at Tier {} ({})", tier, tolerance.error_name);
454+
455+
let matched_lines: Vec<String> = content_lines[start..end]
456+
.iter()
457+
.map(|s| (*s).to_string())
458+
.collect();
459+
460+
// Check for removed indentation
461+
let removed_indentation = if tolerance.error_name == REMOVE_INDENTATION_WARNING {
462+
matched_lines
463+
.iter()
464+
.filter(|l| !l.trim().is_empty())
465+
.map(|l| get_removed_indentation(l))
466+
.next()
467+
} else {
468+
None
469+
};
470+
471+
let mut warnings = HashSet::new();
472+
if tolerance.severity == ToleranceSeverity::Warning && !tolerance.error_name.is_empty() {
473+
warnings.insert(tolerance.error_name.to_string());
474+
}
475+
476+
return vec![TieredMatchResult {
477+
result: MatchResult {
478+
matched_slice: (start, end),
479+
line_range: (start + 1, end),
480+
tolerances_hit: vec![ToleranceHit {
481+
tolerance_index: tier_idx,
482+
severity: tolerance.severity,
483+
score_multiplier: tolerance.score_multiplier,
484+
error_name: tolerance.error_name,
485+
count: n_search,
486+
}],
487+
score: tolerance.score_multiplier * n_search as f64,
488+
warnings,
489+
matched_lines,
490+
removed_indentation,
491+
},
492+
tier,
493+
early_exit: true,
494+
}];
495+
}
496+
497+
// Multiple matches at this tier - continue to next tier for more specificity
498+
// (unless this is the last tier)
499+
if !matched_slices.is_empty() && tier == tolerances.len() {
500+
trace!("Tiered match: {} matches at final tier {}", matched_slices.len(), tier);
501+
return matched_slices
502+
.into_iter()
503+
.map(|(start, end)| {
504+
let matched_lines: Vec<String> = content_lines[start..end]
505+
.iter()
506+
.map(|s| (*s).to_string())
507+
.collect();
508+
509+
let removed_indentation = if tolerance.error_name == REMOVE_INDENTATION_WARNING {
510+
matched_lines
511+
.iter()
512+
.filter(|l| !l.trim().is_empty())
513+
.map(|l| get_removed_indentation(l))
514+
.next()
515+
} else {
516+
None
517+
};
518+
519+
let mut warnings = HashSet::new();
520+
if tolerance.severity == ToleranceSeverity::Warning
521+
&& !tolerance.error_name.is_empty()
522+
{
523+
warnings.insert(tolerance.error_name.to_string());
524+
}
525+
526+
TieredMatchResult {
527+
result: MatchResult {
528+
matched_slice: (start, end),
529+
line_range: (start + 1, end),
530+
tolerances_hit: vec![ToleranceHit {
531+
tolerance_index: tier_idx,
532+
severity: tolerance.severity,
533+
score_multiplier: tolerance.score_multiplier,
534+
error_name: tolerance.error_name,
535+
count: n_search,
536+
}],
537+
score: tolerance.score_multiplier * n_search as f64,
538+
warnings,
539+
matched_lines,
540+
removed_indentation,
541+
},
542+
tier,
543+
early_exit: false,
544+
}
545+
})
546+
.collect();
547+
}
548+
}
549+
550+
// No matches found at any tier
551+
trace!("Tiered match: no matches found at any tier");
552+
vec![]
553+
}
554+
304555
/// Match with tolerances applied
305556
pub fn match_with_tolerance(
306557
content_lines: &[&str],
@@ -1089,4 +1340,120 @@ mod tests {
10891340
assert!(result.content.contains(" fn new_test()"));
10901341
assert!(result.content.contains(" new_code()"));
10911342
}
1343+
1344+
// ========================================================================
1345+
// Tiered Matching with Early Exit Tests
1346+
// ========================================================================
1347+
1348+
#[test]
fn test_tiered_exact_match_early_exit() {
    // A verbatim hit must be reported by Tier 0 with the early-exit flag set.
    let haystack = vec!["line1", "line2", "line3"];
    let needle = vec!["line1", "line2"];

    let found = match_tiered(&haystack, 0, &needle);
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.tier, 0); // exact tier
    assert!(only.early_exit);
    assert_eq!(only.result.matched_slice, (0, 2));
    assert_eq!(only.result.score, 0.0); // no tolerance was needed
}
1362+
1363+
#[test]
fn test_tiered_rstrip_early_exit() {
    // Trailing whitespace in the content is expected to resolve at Tier 1 (rstrip).
    let haystack = vec!["line1 ", "line2 "];
    let needle = vec!["line1", "line2"];

    let found = match_tiered(&haystack, 0, &needle);
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.tier, 1); // rstrip tier
    assert!(only.early_exit);
    assert_eq!(only.result.score, 2.0); // multiplier 1.0 over 2 search lines
}
1376+
1377+
#[test]
fn test_tiered_lstrip_early_exit() {
    // Differing indentation is expected to resolve at Tier 2 (lstrip) with a warning.
    let haystack = vec![" fn foo() {", " bar()", " }"];
    let needle = vec!["fn foo() {", " bar()", "}"];

    let found = match_tiered(&haystack, 0, &needle);
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.tier, 2); // lstrip tier
    assert!(only.early_exit);
    assert!(only.result.warnings.contains(REMOVE_INDENTATION_WARNING));
}
1390+
1391+
#[test]
fn test_tiered_line_nums_early_exit() {
    // Line-number prefixes on the search block are expected to resolve at Tier 3.
    let haystack = vec!["fn main() {", " println!(\"hello\");", "}"];
    let needle = vec!["1 fn main() {", "2 println!(\"hello\");", "3 }"];

    let found = match_tiered(&haystack, 0, &needle);
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.tier, 3); // line-number removal tier
    assert!(only.early_exit);
    assert!(only.result.warnings.contains(REMOVE_LINE_NUMS_WARNING));
}
1404+
1405+
#[test]
fn test_tiered_unicode_early_exit() {
    // Smart quotes in the search block are expected to resolve at Tier 4.
    let haystack = vec!["let x = \"hello\";"];
    let needle = vec!["let x = \u{201C}hello\u{201D};"];

    let found = match_tiered(&haystack, 0, &needle);
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.tier, 4); // unicode normalisation tier
    assert!(only.early_exit);
    assert!(only.result.warnings.contains(NORMALIZE_CHARS_WARNING));
}
1418+
1419+
#[test]
fn test_tiered_multiple_exact_matches() {
    // Two verbatim occurrences: both come back from Tier 0 and neither is an early exit.
    let haystack = vec!["x = 1", "y = 2", "x = 1"];
    let needle = vec!["x = 1"];

    let found = match_tiered(&haystack, 0, &needle);

    assert_eq!(found.len(), 2);
    assert!(found.iter().all(|hit| hit.tier == 0));
    assert!(!found.iter().any(|hit| hit.early_exit)); // ambiguous, so no early exit
}
1431+
1432+
#[test]
fn test_tiered_no_match() {
    // Nothing resembling the search block exists, so every tier comes up empty.
    let haystack = vec!["completely", "different", "content"];
    let needle = vec!["not", "found"];

    let found = match_tiered(&haystack, 0, &needle);

    assert!(found.is_empty());
}
1442+
1443+
#[test]
fn test_tiered_performance_exact_match() {
    // 1000 lines of filler with a two-line target in the middle: Tier 0 should find it.
    let mut haystack: Vec<&str> = vec!["filler line"; 1000];
    haystack[500] = "target line 1";
    haystack[501] = "target line 2";

    let needle = vec!["target line 1", "target line 2"];

    let found = match_tiered(&haystack, 0, &needle);
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.tier, 0); // resolved by the exact pass
    assert!(only.early_exit);
    assert_eq!(only.result.matched_slice, (500, 502));
}
10921459
}

0 commit comments

Comments
 (0)