Skip to content

Commit f30f2d3

Browse files
committed
Handle non-ascii chars in query context summary
1 parent 7b8391e commit f30f2d3

1 file changed

Lines changed: 58 additions & 16 deletions

File tree

native/spark-expr/src/query_context.rs

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,15 @@ impl QueryContext {
8080
}
8181
}
8282

83+
/// Convert a zero-based character index into the byte offset of that character in the SQL text.
84+
/// Returns None if the character index is out of range (this includes an index equal to the character count).
85+
fn char_index_to_byte_offset(&self, char_index: usize) -> Option<usize> {
86+
self.sql_text
87+
.char_indices() // yields (byte offset, char) pairs
88+
.nth(char_index) // None once char_index reaches the number of characters
89+
.map(|(byte_offset, _)| byte_offset)
90+
}
91+
8392
/// Generate a summary string showing SQL fragment with error location.
8493
/// (From SQLQueryContext.summary)
8594
///
@@ -90,14 +99,23 @@ impl QueryContext {
9099
/// ^^^
91100
/// ```
92101
pub fn format_summary(&self) -> String {
93-
let start_idx = self.start_index.max(0) as usize;
94-
let stop_idx = (self.stop_index + 1).max(0) as usize;
95-
96-
// Extract the problematic fragment
97-
let fragment = if start_idx < self.sql_text.len() && stop_idx <= self.sql_text.len() {
98-
&self.sql_text[start_idx..stop_idx]
99-
} else {
100-
""
102+
let start_char = self.start_index.max(0) as usize;
103+
// stop_index is inclusive; fragment covers [start, stop]
104+
let stop_char = (self.stop_index + 1).max(0) as usize;
105+
106+
let fragment = match (
107+
self.char_index_to_byte_offset(start_char),
108+
// stop_char may equal sql_text.chars().count() (one past the end)
109+
self.char_index_to_byte_offset(stop_char).or_else(|| {
110+
if stop_char == self.sql_text.chars().count() {
111+
Some(self.sql_text.len())
112+
} else {
113+
None
114+
}
115+
}),
116+
) {
117+
(Some(start_byte), Some(stop_byte)) => &self.sql_text[start_byte..stop_byte],
118+
_ => "",
101119
};
102120

103121
// Build the header line
@@ -130,20 +148,29 @@ impl QueryContext {
130148
// Add caret pointer
131149
let caret_position = self.start_position.max(0) as usize;
132150
summary.push_str(&" ".repeat(caret_position));
133-
summary.push_str(&"^".repeat(fragment.len().max(1)));
151+
// Count characters rather than bytes so each multi-byte character gets one caret (wide or combining characters may still render at a different width)
152+
summary.push_str(&"^".repeat(fragment.chars().count().max(1)));
134153

135154
summary
136155
}
137156

138157
/// Returns the SQL fragment that caused the error, or an empty string when the indices fall outside the SQL text.
139158
pub fn fragment(&self) -> String {
140-
let start_idx = self.start_index.max(0) as usize;
141-
let stop_idx = (self.stop_index + 1).max(0) as usize;
142-
143-
if start_idx < self.sql_text.len() && stop_idx <= self.sql_text.len() {
144-
self.sql_text[start_idx..stop_idx].to_string()
145-
} else {
146-
String::new()
159+
let start_char = self.start_index.max(0) as usize; // negative start is clamped to 0
160+
let stop_char = (self.stop_index + 1).max(0) as usize; // exclusive end, counted in characters
161+
162+
match (
163+
self.char_index_to_byte_offset(start_char),
164+
self.char_index_to_byte_offset(stop_char).or_else(|| {
165+
if stop_char == self.sql_text.chars().count() { // one past the last character: slice to the end of the text
166+
Some(self.sql_text.len())
167+
} else {
168+
None
169+
}
170+
}),
171+
) {
172+
(Some(start_byte), Some(stop_byte)) => self.sql_text[start_byte..stop_byte].to_string(),
173+
_ => String::new(), // either index is out of range
147174
}
148175
}
149176
}
@@ -357,4 +384,19 @@ mod tests {
357384
assert_eq!(map.len(), 0);
358385
assert!(map.is_empty());
359386
}
387+
388+
// Verify that fragment() correctly handles SQL text that contains multi-byte
389+
// characters (format_summary() is not asserted by this test).
390+
391+
#[test]
392+
fn test_fragment_non_ascii_accented() {
393+
// "é" is a 2-byte UTF-8 sequence (U+00E9).
394+
// SQL: "SELECT café FROM t"
395+
// 0123456789...
396+
// char indices: c=7, a=8, f=9, é=10, ' '=11, and FROM starts at 12.
397+
// start_index=7, stop_index=10 (inclusive) should yield "café"
398+
let sql = "SELECT café FROM t".to_string();
399+
let ctx = QueryContext::new(sql, 7, 10, None, None, 1, 7);
400+
assert_eq!(ctx.fragment(), "café");
401+
}
360402
}

0 commit comments

Comments
 (0)