|
3 | 3 | //! This module implements a 5-level tolerance system for matching search blocks |
4 | 4 | //! in file content, inspired by the wcgw Python implementation. |
5 | 5 | //! |
| 6 | +//! ## Optimization: Tiered Matching with Early Exit (v0.2.2) |
| 7 | +//! |
| 8 | +//! The matching algorithm uses a tiered approach with early exit: |
| 9 | +//! - **Tier 0**: Exact match (fastest, no tolerance) |
| 10 | +//! - **Tier 1-5**: Progressive tolerance application with early exit |
| 11 | +//! |
| 12 | +//! When a unique match is found at any tier, processing stops immediately. |
| 13 | +//! This dramatically reduces processing time for well-formatted code. |
| 14 | +//! |
6 | 15 | //! Tolerance levels (applied in order): |
7 | 16 | //! 1. `rstrip` - Remove trailing whitespace (SILENT) |
8 | 17 | //! 2. `lstrip` - Remove leading indentation (WARNING, score 10x) |
@@ -301,6 +310,248 @@ pub fn match_exact(content_lines: &[&str], content_offset: usize, search_lines: |
301 | 310 | find_contiguous_matches(&search_line_positions) |
302 | 311 | } |
303 | 312 |
|
| 313 | +// ============================================================================ |
| 314 | +// Tiered Matching with Early Exit (Optimized) |
| 315 | +// ============================================================================ |
| 316 | + |
| 317 | +/// Result of tiered matching - includes which tier found the match |
| 318 | +#[derive(Debug, Clone)] |
| 319 | +pub struct TieredMatchResult { |
| 320 | + /// The match result |
| 321 | + pub result: MatchResult, |
| 322 | + /// Which tier (0-5) found the match (0 = exact, 1-5 = tolerance levels) |
| 323 | + pub tier: usize, |
| 324 | + /// Whether early exit was triggered |
| 325 | + pub early_exit: bool, |
| 326 | +} |
| 327 | + |
| 328 | +/// Tiered matching with early exit optimization |
| 329 | +/// |
| 330 | +/// This is the PREFERRED matching function for performance. |
| 331 | +/// It applies tolerances progressively and exits as soon as a unique match is found. |
| 332 | +/// |
| 333 | +/// # Algorithm |
| 334 | +/// 1. Try exact match first (Tier 0) |
| 335 | +/// 2. If no unique match, apply tolerance level 1 (rstrip) |
| 336 | +/// 3. If no unique match, apply tolerance level 2 (lstrip) |
| 337 | +/// 4. Continue until unique match found OR all tiers exhausted |
| 338 | +/// |
| 339 | +/// # Early Exit Conditions |
| 340 | +/// - Exactly 1 match found at current tier → return immediately |
| 341 | +/// - 0 matches at all tiers → return empty |
| 342 | +/// - Multiple matches at final tier → return all (caller handles ambiguity) |
| 343 | +pub fn match_tiered( |
| 344 | + content_lines: &[&str], |
| 345 | + content_offset: usize, |
| 346 | + search_lines: &[&str], |
| 347 | +) -> Vec<TieredMatchResult> { |
| 348 | + let n_search = search_lines.len(); |
| 349 | + let n_content = content_lines.len(); |
| 350 | + |
| 351 | + if n_search == 0 || n_content == 0 || n_search > n_content - content_offset { |
| 352 | + return vec![]; |
| 353 | + } |
| 354 | + |
| 355 | + // Tier 0: Exact match (fastest path) |
| 356 | + let exact_matches = match_exact(content_lines, content_offset, search_lines); |
| 357 | + if exact_matches.len() == 1 { |
| 358 | + let (start, end) = exact_matches[0]; |
| 359 | + debug!("Tiered match: early exit at Tier 0 (exact match)"); |
| 360 | + return vec![TieredMatchResult { |
| 361 | + result: MatchResult { |
| 362 | + matched_slice: (start, end), |
| 363 | + line_range: (start + 1, end), |
| 364 | + tolerances_hit: vec![], |
| 365 | + score: 0.0, |
| 366 | + warnings: HashSet::new(), |
| 367 | + matched_lines: content_lines[start..end] |
| 368 | + .iter() |
| 369 | + .map(|s| (*s).to_string()) |
| 370 | + .collect(), |
| 371 | + removed_indentation: None, |
| 372 | + }, |
| 373 | + tier: 0, |
| 374 | + early_exit: true, |
| 375 | + }]; |
| 376 | + } |
| 377 | + |
| 378 | + // If multiple exact matches, return all (ambiguous) |
| 379 | + if exact_matches.len() > 1 { |
| 380 | + debug!("Tiered match: {} exact matches (ambiguous)", exact_matches.len()); |
| 381 | + return exact_matches |
| 382 | + .into_iter() |
| 383 | + .map(|(start, end)| TieredMatchResult { |
| 384 | + result: MatchResult { |
| 385 | + matched_slice: (start, end), |
| 386 | + line_range: (start + 1, end), |
| 387 | + tolerances_hit: vec![], |
| 388 | + score: 0.0, |
| 389 | + warnings: HashSet::new(), |
| 390 | + matched_lines: content_lines[start..end] |
| 391 | + .iter() |
| 392 | + .map(|s| (*s).to_string()) |
| 393 | + .collect(), |
| 394 | + removed_indentation: None, |
| 395 | + }, |
| 396 | + tier: 0, |
| 397 | + early_exit: false, |
| 398 | + }) |
| 399 | + .collect(); |
| 400 | + } |
| 401 | + |
| 402 | + // Tier 1-5: Apply tolerances progressively with early exit |
| 403 | + let tolerances = default_tolerances(); |
| 404 | + |
| 405 | + // Pre-process content lines once (avoid repeated allocations) |
| 406 | + let content_processed: Vec<Vec<String>> = tolerances |
| 407 | + .iter() |
| 408 | + .map(|tol| { |
| 409 | + content_lines |
| 410 | + .iter() |
| 411 | + .skip(content_offset) |
| 412 | + .map(|line| (tol.line_process)(line)) |
| 413 | + .collect() |
| 414 | + }) |
| 415 | + .collect(); |
| 416 | + |
| 417 | + // Try each tolerance tier |
| 418 | + for (tier_idx, tolerance) in tolerances.iter().enumerate() { |
| 419 | + let tier = tier_idx + 1; // Tier 1-5 (0 was exact) |
| 420 | + |
| 421 | + // Process search lines with this tolerance |
| 422 | + let search_processed: Vec<String> = search_lines |
| 423 | + .iter() |
| 424 | + .map(|line| (tolerance.line_process)(line)) |
| 425 | + .collect(); |
| 426 | + |
| 427 | + // Build position map for this tier |
| 428 | + let mut content_positions: HashMap<&str, HashSet<usize>> = HashMap::new(); |
| 429 | + for (i, processed_line) in content_processed[tier_idx].iter().enumerate() { |
| 430 | + content_positions |
| 431 | + .entry(processed_line.as_str()) |
| 432 | + .or_default() |
| 433 | + .insert(i + content_offset); |
| 434 | + } |
| 435 | + |
| 436 | + // Get positions for each search line |
| 437 | + let search_line_positions: Vec<HashSet<usize>> = search_processed |
| 438 | + .iter() |
| 439 | + .map(|line| { |
| 440 | + content_positions |
| 441 | + .get(line.as_str()) |
| 442 | + .cloned() |
| 443 | + .unwrap_or_default() |
| 444 | + }) |
| 445 | + .collect(); |
| 446 | + |
| 447 | + // Find contiguous matches at this tier |
| 448 | + let matched_slices = find_contiguous_matches(&search_line_positions); |
| 449 | + |
| 450 | + // Early exit: exactly 1 match found |
| 451 | + if matched_slices.len() == 1 { |
| 452 | + let (start, end) = matched_slices[0]; |
| 453 | + debug!("Tiered match: early exit at Tier {} ({})", tier, tolerance.error_name); |
| 454 | + |
| 455 | + let matched_lines: Vec<String> = content_lines[start..end] |
| 456 | + .iter() |
| 457 | + .map(|s| (*s).to_string()) |
| 458 | + .collect(); |
| 459 | + |
| 460 | + // Check for removed indentation |
| 461 | + let removed_indentation = if tolerance.error_name == REMOVE_INDENTATION_WARNING { |
| 462 | + matched_lines |
| 463 | + .iter() |
| 464 | + .filter(|l| !l.trim().is_empty()) |
| 465 | + .map(|l| get_removed_indentation(l)) |
| 466 | + .next() |
| 467 | + } else { |
| 468 | + None |
| 469 | + }; |
| 470 | + |
| 471 | + let mut warnings = HashSet::new(); |
| 472 | + if tolerance.severity == ToleranceSeverity::Warning && !tolerance.error_name.is_empty() { |
| 473 | + warnings.insert(tolerance.error_name.to_string()); |
| 474 | + } |
| 475 | + |
| 476 | + return vec![TieredMatchResult { |
| 477 | + result: MatchResult { |
| 478 | + matched_slice: (start, end), |
| 479 | + line_range: (start + 1, end), |
| 480 | + tolerances_hit: vec![ToleranceHit { |
| 481 | + tolerance_index: tier_idx, |
| 482 | + severity: tolerance.severity, |
| 483 | + score_multiplier: tolerance.score_multiplier, |
| 484 | + error_name: tolerance.error_name, |
| 485 | + count: n_search, |
| 486 | + }], |
| 487 | + score: tolerance.score_multiplier * n_search as f64, |
| 488 | + warnings, |
| 489 | + matched_lines, |
| 490 | + removed_indentation, |
| 491 | + }, |
| 492 | + tier, |
| 493 | + early_exit: true, |
| 494 | + }]; |
| 495 | + } |
| 496 | + |
| 497 | + // Multiple matches at this tier - continue to next tier for more specificity |
| 498 | + // (unless this is the last tier) |
| 499 | + if !matched_slices.is_empty() && tier == tolerances.len() { |
| 500 | + trace!("Tiered match: {} matches at final tier {}", matched_slices.len(), tier); |
| 501 | + return matched_slices |
| 502 | + .into_iter() |
| 503 | + .map(|(start, end)| { |
| 504 | + let matched_lines: Vec<String> = content_lines[start..end] |
| 505 | + .iter() |
| 506 | + .map(|s| (*s).to_string()) |
| 507 | + .collect(); |
| 508 | + |
| 509 | + let removed_indentation = if tolerance.error_name == REMOVE_INDENTATION_WARNING { |
| 510 | + matched_lines |
| 511 | + .iter() |
| 512 | + .filter(|l| !l.trim().is_empty()) |
| 513 | + .map(|l| get_removed_indentation(l)) |
| 514 | + .next() |
| 515 | + } else { |
| 516 | + None |
| 517 | + }; |
| 518 | + |
| 519 | + let mut warnings = HashSet::new(); |
| 520 | + if tolerance.severity == ToleranceSeverity::Warning |
| 521 | + && !tolerance.error_name.is_empty() |
| 522 | + { |
| 523 | + warnings.insert(tolerance.error_name.to_string()); |
| 524 | + } |
| 525 | + |
| 526 | + TieredMatchResult { |
| 527 | + result: MatchResult { |
| 528 | + matched_slice: (start, end), |
| 529 | + line_range: (start + 1, end), |
| 530 | + tolerances_hit: vec![ToleranceHit { |
| 531 | + tolerance_index: tier_idx, |
| 532 | + severity: tolerance.severity, |
| 533 | + score_multiplier: tolerance.score_multiplier, |
| 534 | + error_name: tolerance.error_name, |
| 535 | + count: n_search, |
| 536 | + }], |
| 537 | + score: tolerance.score_multiplier * n_search as f64, |
| 538 | + warnings, |
| 539 | + matched_lines, |
| 540 | + removed_indentation, |
| 541 | + }, |
| 542 | + tier, |
| 543 | + early_exit: false, |
| 544 | + } |
| 545 | + }) |
| 546 | + .collect(); |
| 547 | + } |
| 548 | + } |
| 549 | + |
| 550 | + // No matches found at any tier |
| 551 | + trace!("Tiered match: no matches found at any tier"); |
| 552 | + vec![] |
| 553 | +} |
| 554 | + |
304 | 555 | /// Match with tolerances applied |
305 | 556 | pub fn match_with_tolerance( |
306 | 557 | content_lines: &[&str], |
@@ -1089,4 +1340,120 @@ mod tests { |
1089 | 1340 | assert!(result.content.contains(" fn new_test()")); |
1090 | 1341 | assert!(result.content.contains(" new_code()")); |
1091 | 1342 | } |
| 1343 | + |
| 1344 | + // ======================================================================== |
| 1345 | + // Tiered Matching with Early Exit Tests |
| 1346 | + // ======================================================================== |
| 1347 | + |
#[test]
fn test_tiered_exact_match_early_exit() {
    // A block that appears verbatim exactly once resolves at Tier 0.
    let haystack = ["line1", "line2", "line3"];
    let needle = ["line1", "line2"];

    let hits = match_tiered(&haystack, 0, &needle);
    assert_eq!(hits.len(), 1);

    let hit = &hits[0];
    assert_eq!(hit.tier, 0); // exact pass
    assert!(hit.early_exit);
    assert_eq!(hit.result.matched_slice, (0, 2));
    assert_eq!(hit.result.score, 0.0); // no tolerance applied
}
| 1362 | + |
#[test]
fn test_tiered_rstrip_early_exit() {
    // Trailing whitespace in the content is absorbed by Tier 1 (rstrip).
    let haystack = ["line1 ", "line2 "];
    let needle = ["line1", "line2"];

    let hits = match_tiered(&haystack, 0, &needle);
    assert_eq!(hits.len(), 1);

    let hit = &hits[0];
    assert_eq!(hit.tier, 1); // rstrip
    assert!(hit.early_exit);
    assert_eq!(hit.result.score, 2.0); // 1.0 per line, 2 lines
}
| 1376 | + |
#[test]
fn test_tiered_lstrip_early_exit() {
    // Indentation differs between content and search: resolved at Tier 2 (lstrip).
    let haystack = [" fn foo() {", " bar()", " }"];
    let needle = ["fn foo() {", " bar()", "}"];

    let hits = match_tiered(&haystack, 0, &needle);
    assert_eq!(hits.len(), 1);

    let hit = &hits[0];
    assert_eq!(hit.tier, 2); // lstrip
    assert!(hit.early_exit);
    assert!(hit.result.warnings.contains(REMOVE_INDENTATION_WARNING));
}
| 1390 | + |
#[test]
fn test_tiered_line_nums_early_exit() {
    // Search lines carry line-number prefixes: resolved at Tier 3.
    let haystack = ["fn main() {", " println!(\"hello\");", "}"];
    let needle = ["1 fn main() {", "2 println!(\"hello\");", "3 }"];

    let hits = match_tiered(&haystack, 0, &needle);
    assert_eq!(hits.len(), 1);

    let hit = &hits[0];
    assert_eq!(hit.tier, 3); // line-number removal
    assert!(hit.early_exit);
    assert!(hit.result.warnings.contains(REMOVE_LINE_NUMS_WARNING));
}
| 1404 | + |
#[test]
fn test_tiered_unicode_early_exit() {
    // Typographic quotes in the search block: resolved at Tier 4.
    let haystack = ["let x = \"hello\";"];
    let needle = ["let x = \u{201C}hello\u{201D};"]; // smart quotes

    let hits = match_tiered(&haystack, 0, &needle);
    assert_eq!(hits.len(), 1);

    let hit = &hits[0];
    assert_eq!(hit.tier, 4); // unicode normalisation
    assert!(hit.early_exit);
    assert!(hit.result.warnings.contains(NORMALIZE_CHARS_WARNING));
}
| 1418 | + |
#[test]
fn test_tiered_multiple_exact_matches() {
    // Two verbatim occurrences: both are reported and neither is an early exit.
    let haystack = ["x = 1", "y = 2", "x = 1"];
    let needle = ["x = 1"];

    let hits = match_tiered(&haystack, 0, &needle);

    assert_eq!(hits.len(), 2); // both occurrences
    for hit in &hits {
        assert!(hit.tier == 0); // found by the exact pass
        assert!(!hit.early_exit); // ambiguous, so not an early exit
    }
}
| 1431 | + |
#[test]
fn test_tiered_no_match() {
    // The search block matches nowhere, at any tier.
    let haystack = ["completely", "different", "content"];
    let needle = ["not", "found"];

    let hits = match_tiered(&haystack, 0, &needle);

    assert!(hits.is_empty());
}
| 1442 | + |
#[test]
fn test_tiered_performance_exact_match() {
    // A unique two-line target buried in 1000 lines is found by the exact pass.
    let mut haystack: Vec<&str> = vec!["filler line"; 1000];
    haystack[500] = "target line 1";
    haystack[501] = "target line 2";

    let needle = ["target line 1", "target line 2"];

    let hits = match_tiered(&haystack, 0, &needle);
    assert_eq!(hits.len(), 1);

    let hit = &hits[0];
    assert_eq!(hit.tier, 0); // exact pass
    assert!(hit.early_exit);
    assert_eq!(hit.result.matched_slice, (500, 502));
}
1092 | 1459 | } |
0 commit comments