Skip to content

Commit cff84ae

Browse files
committed
feat(fs_write): fuzzy str_replace with 3-strategy fallback chain
The current str_replace implementation uses exact byte matching only. When the model's old_str has minor differences from the file (indentation drift, whitespace, or small context edits), the match fails and the model either retries wastefully or falls back to destructive shell commands.

Implement str_replace_fuzzy() with a 3-strategy fallback chain inspired by opencode and cline's diff-apply approaches:

1. Exact match — unchanged behaviour for the common case
2. Line-trimmed match — compares lines after trim(), then replaces using byte offsets (prefix-sum table) into the original content. Handles indentation drift.
3. Block-anchor match — uses first+last line as anchors, scores middle lines with Levenshtein similarity, picks the best candidate above a 0.6 threshold. Handles minor edits in surrounding context lines.

Also:
- validate() now rejects empty old_str before reaching fuzzy matching
- tool_index.json description updated to reflect fuzzy tolerance and reinforce read-before-write / no-sed-fallback guidance
- tool_index.json and fs_write.rs are now consistent (previously split)

Key correctness properties:
- Strategies 2 and 3 return byte ranges (start, end) — replacement is always at the correct position even if matched text appears elsewhere
- block_anchor_match skips first==last anchors (false positive guard)
- similarity_score respects actual content window bounds
- levenshtein uses O(n) rolling-row space, char count for denominator
- build_line_offsets prefix-sum gives O(1) offset lookup
- strip_empty_boundary_lines handles both leading and trailing empty lines

11 tests cover: exact match, ambiguous rejection, empty old_str, indentation drift, minor middle-line edits, the no-match error message, correct-position replacement when matched text appears elsewhere, symmetric anchor rejection, Levenshtein correctness, line-trimmed indentation preservation, and ambiguity rejection.
1 parent e14ea18 commit cff84ae

2 files changed

Lines changed: 362 additions & 72 deletions

File tree

crates/chat-cli/src/cli/chat/tools/fs_write.rs

Lines changed: 299 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,6 @@ impl FsWrite {
133133
},
134134
FsWrite::StrReplace { old_str, new_str, .. } => {
135135
let file = os.fs.read_to_string(&path).await?;
136-
let matches = file.match_indices(old_str).collect::<Vec<_>>();
137136
queue!(
138137
output,
139138
style::Print("Updating: "),
@@ -142,14 +141,8 @@ impl FsWrite {
142141
StyledText::reset(),
143142
style::Print("\n"),
144143
)?;
145-
match matches.len() {
146-
0 => return Err(eyre!("no occurrences of \"{old_str}\" were found")),
147-
1 => {
148-
let file = file.replacen(old_str, new_str, 1);
149-
os.fs.write(&path, file).await?;
150-
},
151-
x => return Err(eyre!("{x} occurrences of old_str were found when only 1 is expected")),
152-
}
144+
let updated = str_replace_fuzzy(&file, old_str, new_str)?;
145+
os.fs.write(&path, updated).await?;
153146
},
154147
FsWrite::Insert {
155148
insert_line, new_str, ..
@@ -393,7 +386,16 @@ impl FsWrite {
393386
bail!("Path must not be empty")
394387
};
395388
},
396-
FsWrite::StrReplace { path, .. } | FsWrite::Insert { path, .. } => {
389+
FsWrite::StrReplace { path, old_str, .. } => {
390+
let path = sanitize_path_tool_arg(os, path);
391+
if !path.exists() {
392+
bail!("The provided path must exist in order to replace or insert contents into it")
393+
}
394+
if old_str.trim().is_empty() {
395+
bail!("old_str must not be empty — use fs_read to read the file first, then provide the exact text to replace")
396+
}
397+
},
398+
FsWrite::Insert { path, .. } => {
397399
let path = sanitize_path_tool_arg(os, path);
398400
if !path.exists() {
399401
bail!("The provided path must exist in order to replace or insert contents into it")
@@ -858,6 +860,189 @@ fn syntect_to_crossterm_color(syntect: syntect::highlighting::Color) -> style::C
858860
}
859861
}
860862

863+
/// Attempts to replace `old_str` with `new_str` in `content` using a fallback chain:
864+
///
865+
/// 1. **Exact match** — fastest, most precise.
866+
/// 2. **Line-trimmed match** — matches lines after stripping leading/trailing whitespace,
867+
/// then replaces the original (indented) text. Handles indentation drift.
868+
/// 3. **Block-anchor match** — matches by first+last line as anchors, uses Levenshtein
869+
/// similarity on middle lines to find the best candidate. Handles minor edits in context.
870+
///
871+
/// Returns an error if no strategy finds exactly one unambiguous match.
872+
fn str_replace_fuzzy(content: &str, old_str: &str, new_str: &str) -> eyre::Result<String> {
873+
// Strategy 1: exact match
874+
let exact_count = content.match_indices(old_str).count();
875+
match exact_count {
876+
1 => return Ok(content.replacen(old_str, new_str, 1)),
877+
x if x > 1 => {
878+
return Err(eyre::eyre!(
879+
"{x} occurrences of old_str were found when only 1 is expected — \
880+
add more surrounding context to old_str to make it unique"
881+
))
882+
},
883+
_ => {},
884+
}
885+
886+
// Strategies 2 & 3: fuzzy — both return a byte range to splice at
887+
let range = line_trimmed_match(content, old_str)
888+
.or_else(|| block_anchor_match(content, old_str));
889+
890+
if let Some((start, end)) = range {
891+
return Ok(format!("{}{}{}", &content[..start], new_str, &content[end..]));
892+
}
893+
894+
Err(eyre::eyre!(
895+
"no occurrences of the provided old_str were found (tried exact, \
896+
line-trimmed, and block-anchor matching) — use fs_read to read the \
897+
current file content and retry str_replace with the exact text. \
898+
Do NOT fall back to shell commands like sed."
899+
))
900+
}
901+
902+
/// Strips leading and trailing blank lines from a split-by-newline vec.
///
/// A line counts as blank when it is empty or consists only of whitespace. Blank lines
/// between non-blank lines are kept. Returns an empty vec when every line is blank.
fn strip_empty_boundary_lines(mut lines: Vec<&str>) -> Vec<&str> {
    // Drop the blank tail first so the head search below only sees kept lines.
    let keep_until = lines
        .iter()
        .rposition(|line| !line.trim().is_empty())
        .map_or(0, |last_kept| last_kept + 1);
    lines.truncate(keep_until);

    // Remove the blank head in a single shift rather than one `remove(0)` per line,
    // which would be quadratic in the number of leading blank lines.
    let keep_from = lines
        .iter()
        .position(|line| !line.trim().is_empty())
        .unwrap_or(lines.len());
    lines.drain(..keep_from);
    lines
}
912+
913+
/// Builds a prefix-sum table of byte offsets for lines produced by splitting on `\n`.
///
/// `offsets[i]` is the byte offset at which line `i` starts in the original string.
/// The table has `lines.len() + 1` entries: the final entry is the offset just past a
/// newline following the last line, i.e. the original length plus one when `lines` came
/// from `str::split('\n')`. Callers subtract one to get the end of a line excluding its
/// terminating newline.
fn build_line_offsets(lines: &[&str]) -> Vec<usize> {
    let mut offsets = Vec::with_capacity(lines.len() + 1);
    let mut next_start = 0usize;
    offsets.push(next_start);
    for line in lines {
        next_start += line.len() + 1; // +1 for the '\n' consumed by the split
        offsets.push(next_start);
    }
    offsets
}
924+
925+
/// Matches `find` against `content` by comparing trimmed lines.
926+
/// Returns the byte range `(start, end)` in `content` if exactly one match is found.
927+
fn line_trimmed_match(content: &str, find: &str) -> Option<(usize, usize)> {
928+
let content_lines: Vec<&str> = content.split('\n').collect();
929+
let search_lines = strip_empty_boundary_lines(find.split('\n').collect());
930+
931+
if search_lines.is_empty() {
932+
return None;
933+
}
934+
935+
let offsets = build_line_offsets(&content_lines);
936+
937+
let mut matches: Vec<(usize, usize)> = Vec::new();
938+
'outer: for i in 0..=content_lines.len().saturating_sub(search_lines.len()) {
939+
for (j, search_line) in search_lines.iter().enumerate() {
940+
if content_lines[i + j].trim() != search_line.trim() {
941+
continue 'outer;
942+
}
943+
}
944+
let start = offsets[i];
945+
let end = offsets[i + search_lines.len()].saturating_sub(1).min(content.len());
946+
matches.push((start, end));
947+
}
948+
949+
if matches.len() == 1 { Some(matches[0]) } else { None }
950+
}
951+
952+
/// Levenshtein (edit) distance between two strings, measured in `char`s.
///
/// Uses the two-row dynamic-programming formulation: the rows have `min(m, n) + 1`
/// entries, where `m` and `n` are the char counts of the inputs. Both inputs are also
/// buffered as `Vec<char>` for indexed access. Identical inputs return `0` immediately
/// without running the table.
fn levenshtein(a: &str, b: &str) -> usize {
    // Fast path: callers frequently compare lines that did not change at all.
    if a == b {
        return 0;
    }

    let a: Vec<char> = a.chars().collect();
    let b: Vec<char> = b.chars().collect();
    // Put the shorter string in the column dimension so the two rows stay small.
    let (rows, cols) = if a.len() >= b.len() { (a, b) } else { (b, a) };

    // `prev[j]` = distance between the processed prefix of `rows` and `cols[..j]`.
    let mut prev: Vec<usize> = (0..=cols.len()).collect();
    let mut curr = vec![0usize; cols.len() + 1];
    for (i, row_char) in rows.iter().enumerate() {
        curr[0] = i + 1;
        for (j, col_char) in cols.iter().enumerate() {
            curr[j + 1] = if row_char == col_char {
                prev[j]
            } else {
                // deletion, insertion, substitution
                1 + prev[j + 1].min(curr[j]).min(prev[j])
            };
        }
        std::mem::swap(&mut prev, &mut curr);
    }
    prev[cols.len()]
}
975+
976+
/// Minimum similarity score (inclusive) that a block-anchor candidate must reach in
/// `block_anchor_match` to be considered a match.
const SIMILARITY_THRESHOLD: f64 = 0.6;
977+
978+
/// Matches `find` against `content` using first+last line as anchors and Levenshtein
979+
/// similarity on middle lines. Returns the byte range `(start, end)` in `content` if
980+
/// similarity exceeds the threshold and the match is unambiguous.
981+
fn block_anchor_match(content: &str, find: &str) -> Option<(usize, usize)> {
982+
let content_lines: Vec<&str> = content.split('\n').collect();
983+
let search_lines = strip_empty_boundary_lines(find.split('\n').collect());
984+
985+
// Need at least 2 distinct lines for anchor matching
986+
if search_lines.len() < 2 {
987+
return None;
988+
}
989+
990+
let first = search_lines[0].trim();
991+
let last = search_lines[search_lines.len() - 1].trim();
992+
993+
// Symmetric anchors (e.g. `}` / `}`) produce too many false positives
994+
if first == last {
995+
return None;
996+
}
997+
998+
// Build offsets once — reused for both scoring and final byte range
999+
let offsets = build_line_offsets(&content_lines);
1000+
1001+
// Collect candidate windows where first and last anchor lines match
1002+
let mut candidates: Vec<(usize, usize, f64)> = Vec::new();
1003+
for i in 0..content_lines.len() {
1004+
if content_lines[i].trim() != first { continue; }
1005+
for j in (i + 1)..content_lines.len() {
1006+
if content_lines[j].trim() == last {
1007+
let score = similarity_score(&content_lines, i, j, &search_lines);
1008+
candidates.push((i, j, score));
1009+
break;
1010+
}
1011+
}
1012+
}
1013+
1014+
// Pick the single best candidate above the threshold
1015+
let best = candidates
1016+
.into_iter()
1017+
.filter(|&(_, _, s)| s >= SIMILARITY_THRESHOLD)
1018+
.max_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))?;
1019+
1020+
let start = offsets[best.0];
1021+
let end = offsets[best.1 + 1].saturating_sub(1).min(content.len());
1022+
Some((start, end))
1023+
}
1024+
1025+
/// Average Levenshtein similarity of middle lines between `search_lines` and the
1026+
/// corresponding window `content_lines[start..=end]`.
1027+
fn similarity_score(content_lines: &[&str], start: usize, end: usize, search_lines: &[&str]) -> f64 {
1028+
let middle_count = search_lines.len().saturating_sub(2);
1029+
if middle_count == 0 { return 1.0; }
1030+
1031+
let mut total = 0.0;
1032+
let mut counted = 0;
1033+
for k in 1..search_lines.len().saturating_sub(1) {
1034+
let ci = start + k;
1035+
if ci >= end { break; }
1036+
let a = content_lines[ci].trim();
1037+
let b = search_lines[k].trim();
1038+
let max_len = a.chars().count().max(b.chars().count());
1039+
if max_len == 0 { total += 1.0; counted += 1; continue; }
1040+
total += 1.0 - levenshtein(a, b) as f64 / max_len as f64;
1041+
counted += 1;
1042+
}
1043+
if counted == 0 { 1.0 } else { total / counted as f64 }
1044+
}
1045+
8611046
#[cfg(test)]
8621047
mod tests {
8631048
use std::collections::HashMap;
@@ -870,6 +1055,110 @@ mod tests {
8701055
setup_test_directory,
8711056
};
8721057

1058+
// ── str_replace_fuzzy tests ──────────────────────────────────────────────
1059+
1060+
#[test]
fn fuzzy_exact_match() {
    // A verbatim, unique `old_str` is replaced in place and nothing else changes.
    let source = "fn foo() {\n let x = 1;\n}\n";
    let updated = str_replace_fuzzy(source, "let x = 1;", "let x = 42;").unwrap();
    let expected = "fn foo() {\n let x = 42;\n}\n";
    assert_eq!(updated, expected);
}
1066+
1067+
#[test]
fn fuzzy_exact_match_fails_on_ambiguous() {
    // The same statement appears twice, so an exact match cannot pick one.
    let duplicated = "let x = 1;\nlet x = 1;\n";
    let outcome = str_replace_fuzzy(duplicated, "let x = 1;", "let x = 2;");
    assert!(outcome.is_err());
}
1072+
1073+
#[test]
fn fuzzy_line_trimmed_handles_indentation_drift() {
    // old_str has different indentation than the file, so the exact strategy misses
    // and the line-trimmed strategy has to locate the block.
    let content = "fn foo() {\n let x = 1;\n let y = 2;\n}\n";
    let old_str = "let x = 1;\nlet y = 2;"; // no indentation
    let result = str_replace_fuzzy(content, old_str, "let x = 10;\nlet y = 20;").unwrap();
    // The matched lines are replaced whole (their indentation included) and the
    // surrounding lines are left untouched.
    assert_eq!(result, "fn foo() {\nlet x = 10;\nlet y = 20;\n}\n");
}
1082+
1083+
#[test]
fn fuzzy_block_anchor_handles_minor_middle_edits() {
    let content = "fn calculate() {\n let result = a + b;\n return result;\n}\n";
    // old_str carries a trailing comment on a middle line that the file does not have,
    // so neither the exact nor the line-trimmed strategy can match.
    let old_str = "fn calculate() {\n let result = a + b; // sum\n return result;\n}";
    let result = str_replace_fuzzy(content, old_str, "fn calculate() {\n return a + b;\n}");
    // Should find a match via block anchor (first+last line match)
    assert!(result.is_ok(), "block anchor should match: {:?}", result);
    // The whole anchored block is replaced; the file's final newline is preserved.
    assert_eq!(result.unwrap(), "fn calculate() {\n return a + b;\n}\n");
}
1093+
1094+
#[test]
fn fuzzy_rejects_empty_old_str() {
    // An empty old_str is expected to be rejected during validation, before fuzzy
    // matching is reached. This test covers str_replace_fuzzy being called with one
    // anyway: "" matches at every position, so the call must fail as ambiguous.
    let outcome = str_replace_fuzzy("fn foo() {}", "", "fn bar() {}");
    assert!(outcome.is_err());

    let msg = outcome.unwrap_err().to_string();
    assert!(msg.contains("occurrences"), "should report ambiguous match: {msg}");
}
1104+
1105+
#[test]
fn fuzzy_returns_error_when_no_strategy_matches() {
    // Nothing in the file resembles old_str, so every strategy comes up empty.
    let outcome = str_replace_fuzzy("fn foo() {}\n", "fn bar() {}", "fn baz() {}");
    assert!(outcome.is_err());

    // The error text doubles as guidance for the model's next step.
    let msg = outcome.unwrap_err().to_string();
    assert!(msg.contains("fs_read"), "error should mention fs_read: {msg}");
    assert!(msg.contains("sed"), "error should warn against sed: {msg}");
}
1114+
1115+
#[test]
fn fuzzy_replaces_correct_occurrence_when_matched_text_appears_elsewhere() {
    // The first line of the fuzzy-matched block also appears earlier in the file.
    // We must replace the matched position, not the first occurrence.
    let content = " let x = 1;\nfn foo() {\n let x = 1;\n let y = 2;\n}\n";
    // old_str with no indentation — line-trimmed will match the block inside fn foo
    let old_str = "let x = 1;\nlet y = 2;";
    let result = str_replace_fuzzy(content, old_str, "let x = 10;\nlet y = 20;").unwrap();
    // The standalone "let x = 1;" at the top must be untouched
    assert!(result.starts_with(" let x = 1;\n"), "first occurrence must be untouched");
    assert!(result.contains("let x = 10;"), "matched block must be replaced");
    // Pin the full output so a splice at the wrong offset cannot slip through.
    assert_eq!(result, " let x = 1;\nfn foo() {\nlet x = 10;\nlet y = 20;\n}\n");
}
1127+
1128+
#[test]
fn block_anchor_skips_symmetric_first_last_lines() {
    // When the first and last lines of the needle are identical, anchoring on them
    // says nothing about where the block is, so block_anchor_match must decline.
    let haystack = "}\n}\n";
    let needle = "}\n}";
    let matched = block_anchor_match(haystack, needle);
    assert!(matched.is_none());
}
1136+
1137+
#[test]
fn levenshtein_space_optimised_matches_naive() {
    // Hand-checked distances for the rolling-row implementation: an empty operand on
    // either side, plus a classic mixed insert/substitute example.
    let from_empty = levenshtein("", "abc");
    assert_eq!(from_empty, 3);

    let to_empty = levenshtein("abc", "");
    assert_eq!(to_empty, 3);

    let weekdays = levenshtein("saturday", "sunday");
    assert_eq!(weekdays, 3);
}
1144+
1145+
#[test]
fn line_trimmed_match_finds_indented_block() {
    let content = "class Foo {\n void bar() {\n int x = 1;\n }\n}\n";
    let find = "void bar() {\n int x = 1;\n}";
    let matched = line_trimmed_match(content, find);
    assert!(matched.is_some(), "should find indented block");
    let (start, end) = matched.unwrap();
    assert!(content[start..end].contains(" void bar()"), "should preserve original indentation");
    // The range covers exactly the three matched lines: original indentation kept,
    // the newline after the last matched line excluded.
    assert_eq!(&content[start..end], " void bar() {\n int x = 1;\n }");
}
1154+
1155+
#[test]
fn line_trimmed_match_returns_none_on_ambiguous() {
    // Two lines trim to the same text, so there is no single match to return.
    let haystack = " foo()\n foo()\n";
    let matched = line_trimmed_match(haystack, "foo()");
    assert!(matched.is_none());
}
1161+
8731162
#[test]
8741163
fn test_fs_write_deserialize() {
8751164
let path = "/my-file";

0 commit comments

Comments
 (0)