Skip to content

Commit 9bcf237

Browse files
43 bug naive colon finder corrupts quoted keys containing colons (#46)
* test: add failing test for quoted key with colon in complex replace * fix: replace naive colon finder with query_exact prefix trim Use doc.query_exact(route) to locate the value's byte span, then compute the key prefix as source[start_byte..value_start].trim_end(). This eliminates string-searching for the separator colon, which corrupted quoted keys containing colons. Removes find_key_colon entirely. Closes #43 * test: add multiple-colons regression test for complex replace * fix: add upfront byte-boundary validation to all indexing sites Apply is_char_boundary + bounds checks before direct string indexing in parse_value, apply_insert_at, and apply_complex_replace (including the new query_exact value span). Returns recoverable errors instead of panicking on misaligned UTF-8 boundaries.
1 parent 379171a commit 9bcf237

2 files changed

Lines changed: 77 additions & 40 deletions

File tree

src/document.rs

Lines changed: 65 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,16 @@ impl PyDocument {
9494
match self.inner.query_exact(&r) {
9595
Ok(Some(feature)) => {
9696
let span = feature.location.byte_span;
97+
// Note: span.0 <= span.1 is guaranteed by tree-sitter node
98+
// ranges, so we only check bounds and UTF-8 alignment.
99+
if span.1 > source.len()
100+
|| !source.is_char_boundary(span.0)
101+
|| !source.is_char_boundary(span.1)
102+
{
103+
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
104+
"Feature span is not valid in source",
105+
));
106+
}
97107
let raw = &source[span.0..span.1];
98108
// Calculate the column offset (in bytes) of the value
99109
// start relative to the beginning of its line, so we can
@@ -285,6 +295,10 @@ fn apply_insert_at(
285295
.ok_or_else(|| format!("insert_at: item at index {resolved} not found"))?;
286296

287297
let item_start = item_feature.location.byte_span.0;
298+
// Note: no reversed-span check needed; tree-sitter nodes guarantee start <= end.
299+
if item_start > source.len() || !source.is_char_boundary(item_start) {
300+
return Err("Feature span is not valid in source".to_string());
301+
}
288302
let line_start = source[..item_start]
289303
.rfind('\n')
290304
.map(|nl| nl + 1)
@@ -343,6 +357,14 @@ fn apply_complex_replace(
343357
.query_pretty(route)
344358
.map_err(|e| format!("Query failed: {e}"))?;
345359

360+
let span = feature.location.byte_span;
361+
// Note: span.0 <= span.1 is guaranteed by tree-sitter node ranges,
362+
// so we only check bounds and UTF-8 alignment.
363+
if span.1 > source.len() || !source.is_char_boundary(span.0) || !source.is_char_boundary(span.1)
364+
{
365+
return Err("Feature span is not valid in source".to_string());
366+
}
367+
346368
let content_with_ws = doc.extract_with_leading_whitespace(&feature);
347369
let content = doc.extract(&feature);
348370

@@ -353,39 +375,52 @@ fn apply_complex_replace(
353375
let start_byte = feature.location.byte_span.0 - ws_len;
354376
let end_byte = feature.location.byte_span.1;
355377

356-
// Find the colon separating key from value
357-
let colon_pos = find_key_colon(content_with_ws);
378+
// Use query_exact to locate the value's byte span independently.
379+
// This avoids string-searching for the colon separator, which breaks
380+
// on quoted keys containing colons (e.g. "http://example.com": 8080).
381+
let value_feature = doc
382+
.query_exact(route)
383+
.map_err(|e| format!("Query failed: {e}"))?;
358384

359-
let key_part = match colon_pos {
360-
Some(pos) => {
361-
let key = &content_with_ws[..pos + 1]; // through the colon
362-
key.to_string()
385+
let key_part = match value_feature {
386+
Some(vf) => {
387+
let vf_start = vf.location.byte_span.0;
388+
// Note: no reversed-span check needed; tree-sitter nodes guarantee start <= end.
389+
if vf_start > source.len() || !source.is_char_boundary(vf_start) {
390+
return Err("Value feature span is not valid in source".to_string());
391+
}
392+
let prefix = source[start_byte..vf_start].trim_end();
393+
if prefix.is_empty() {
394+
// Bare value (e.g. sequence item) — no key prefix
395+
let serialized = serde_yaml::to_string(value)
396+
.map_err(|e| format!("Failed to serialize YAML: {e}"))?;
397+
let trimmed = serialized.trim_end_matches('\n');
398+
399+
let line_start = source[..feature.location.byte_span.0]
400+
.rfind('\n')
401+
.map(|nl| nl + 1)
402+
.unwrap_or(0);
403+
let base_indent = feature.location.byte_span.0 - line_start;
404+
let indent_str = " ".repeat(base_indent);
405+
406+
let indented = indent_block(trimmed, &indent_str);
407+
408+
let mut result = source.to_string();
409+
result.replace_range(
410+
feature.location.byte_span.0..feature.location.byte_span.1,
411+
&indented,
412+
);
413+
if !result.ends_with('\n') {
414+
result.push('\n');
415+
}
416+
return yamlpath::Document::new(result)
417+
.map_err(|e| format!("Failed to re-parse YAML: {e}"));
418+
}
419+
prefix.to_string()
363420
}
364421
None => {
365-
// No colon found — bare value (e.g. sequence item)
366-
let serialized = serde_yaml::to_string(value)
367-
.map_err(|e| format!("Failed to serialize YAML: {e}"))?;
368-
let trimmed = serialized.trim_end_matches('\n');
369-
370-
let line_start = source[..feature.location.byte_span.0]
371-
.rfind('\n')
372-
.map(|nl| nl + 1)
373-
.unwrap_or(0);
374-
let base_indent = feature.location.byte_span.0 - line_start;
375-
let indent_str = " ".repeat(base_indent);
376-
377-
let indented = indent_block(trimmed, &indent_str);
378-
379-
let mut result = source.to_string();
380-
result.replace_range(
381-
feature.location.byte_span.0..feature.location.byte_span.1,
382-
&indented,
383-
);
384-
if !result.ends_with('\n') {
385-
result.push('\n');
386-
}
387-
return yamlpath::Document::new(result)
388-
.map_err(|e| format!("Failed to re-parse YAML: {e}"));
422+
// Absent value (e.g. `key:\n`) — content is just key+colon
423+
content_with_ws.trim_end().to_string()
389424
}
390425
};
391426

@@ -424,16 +459,6 @@ fn apply_complex_replace(
424459
yamlpath::Document::new(result).map_err(|e| format!("Failed to re-parse YAML: {e}"))
425460
}
426461

427-
/// Find the first colon (key-value separator) in a YAML fragment.
428-
///
429-
/// Uses a naive `find(':')`, consistent with yamlpatch's own Replace
430-
/// implementation. This means colons inside quoted keys will be
431-
/// misidentified — a known yamlpatch limitation that will be fixed
432-
/// uniformly when yamlpatch addresses it.
433-
fn find_key_colon(content: &str) -> Option<usize> {
434-
content.find(':')
435-
}
436-
437462
fn indent_block(content: &str, indent: &str) -> String {
438463
let mut result = String::new();
439464
for (i, line) in content.lines().enumerate() {

tests/test_document.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,18 @@ def test_replace_flow_mapping_with_dict(self):
349349
doc2 = doc.replace("config", value={"x": 10})
350350
assert doc2["config"] == {"x": 10}
351351

352+
def test_replace_quoted_key_with_colon_complex_value(self):
353+
"""Regression: colon inside quoted key must not corrupt complex replace."""
354+
doc = Document('"http://example.com": 8080\n')
355+
doc2 = doc.replace("http://example.com", value={"port": 9090})
356+
assert doc2["http://example.com"] == {"port": 9090}
357+
358+
def test_replace_quoted_key_with_multiple_colons_complex_value(self):
359+
"""Regression: multiple colons inside quoted key must not corrupt complex replace."""
360+
doc = Document('"a:b:c": val\n')
361+
doc2 = doc.replace("a:b:c", value={"x": 1})
362+
assert doc2["a:b:c"] == {"x": 1}
363+
352364
def test_replace_key_with_hash_in_value(self):
353365
doc = Document("color: '#ff0000'\n")
354366
doc2 = doc.replace("color", value={"r": 255, "g": 0, "b": 0})

0 commit comments

Comments
 (0)