Stop trimming escaped spaces off the end of the regex

ratmice · ratmice · commit 124f613aa88d · 2026-05-02T17:42:01.000-07:00
diff --git a/lrlex/Cargo.toml b/lrlex/Cargo.toml
@@ -38,3 +38,4 @@ serde = { workspace = true, optional = true }
 prettyplease.workspace = true
 syn.workspace = true
 glob.workspace = true
+unicode-width.workspace = true
diff --git a/lrlex/src/lib/parser.rs b/lrlex/src/lib/parser.rs
@@ -490,7 +490,7 @@ where
 
         if !dupe {
             let (start_states, re_str) =
-                self.parse_start_states(i, line[..rspace].trim_end_matches(matches_whitespace))?;
+                self.parse_start_states(i, trim_end_unescaped(&line[..rspace]))?;
             let rules_len = self.rules.len();
             let tok_id = LexerTypesT::StorageT::try_from(rules_len)
                            .unwrap_or_else(|_| panic!("StorageT::try_from \
@@ -685,6 +685,43 @@ where
     }
 }
 
+/// Trims trailing whitespace (characters matching `RE_SPACE_SEP`) from the end of `s`, but
+/// keeps the first character of that trailing run when it is escaped by an odd number of
+/// directly preceding backslashes. For example `x\  ` becomes `x\ `, while `x\\  ` becomes
+/// `x\\`. Returns `s` unchanged when it does not end in whitespace.
+fn trim_end_unescaped(s: &str) -> &str {
+    let mut cbuf = [0; 4];
+    // Byte offset at which the trailing run of whitespace begins, and the UTF-8 length of the
+    // first character in that run. Offsets are counted in bytes (`len_utf8`), never display
+    // columns, so that slicing `s` always lands on a character boundary.
+    let mut ws_start = s.len();
+    let mut first_ws_len = 0;
+    for ch in s.chars().rev() {
+        if !RE_SPACE_SEP.is_match(ch.encode_utf8(&mut cbuf)) {
+            break;
+        }
+        first_ws_len = ch.len_utf8();
+        ws_start -= first_ws_len;
+    }
+    if ws_start == s.len() {
+        return s;
+    }
+    // Count the backslashes directly before the whitespace run: an even number of them all
+    // escape each other, while an odd number leaves one that escapes the first whitespace
+    // character.
+    let preceding_backslashes = s[..ws_start]
+        .chars()
+        .rev()
+        .take_while(|&c| c == '\\')
+        .count();
+    if preceding_backslashes % 2 == 1 {
+        // Keep the escaped whitespace character as part of the regex.
+        ws_start += first_ws_len;
+    }
+
+    &s[..ws_start]
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -1826,4 +1863,37 @@ b "A"
             18,
         );
     }
+
+    /// Each pair is `(input, expected)`: trailing spaces are trimmed, except that one space is
+    /// kept when it directly follows an odd number of backslashes.
+    #[test]
+    fn unescaped_trim() {
+        let cases = [
+            (r"\ ", r"\ "),
+            (r"\  ", r"\ "),
+            (r"\\ ", r"\\"),
+            (r"\\  ", r"\\"),
+            (r"\\\ ", r"\\\ "),
+            (r"\\\  ", r"\\\ "),
+            (r"\\\\ ", r"\\\\"),
+            (r"\\\\  ", r"\\\\"),
+            (r"x", r"x"),
+            (r"x\ ", r"x\ "),
+            (r"x\  ", r"x\ "),
+            (r"x\\ ", r"x\\"),
+            (r"x\\  ", r"x\\"),
+            (r"x\\\ ", r"x\\\ "),
+            (r"x\\\  ", r"x\\\ "),
+            (r"x\\\\ ", r"x\\\\"),
+            (r"x\\\\  ", r"x\\\\"),
+            (r"x\ y ", r"x\ y"),
+            (r"x\ y\ ", r"x\ y\ "),
+            (r"x\ y\\ ", r"x\ y\\"),
+            (r"x\ y  ", r"x\ y"),
+            (r"x\ y\  ", r"x\ y\ "),
+            (r"x\ y\\  ", r"x\ y\\"),
+        ];
+        for (input, want) in cases {
+            assert_eq!(want, trim_end_unescaped(input));
+        }
+    }
 }
diff --git a/lrpar/cttests/src/regex_trailing_ws.test b/lrpar/cttests/src/regex_trailing_ws.test
@@ -0,0 +1,19 @@
+name: Test with regex containing trailing ws
+grammar: |
+    %grmtools {
+        yacckind: Original(YaccOriginalActionKind::NoAction),
+        recoverer: RecoveryKind::None,
+        test_files: ["*.input_trailing_ws"],
+    }
+    %start Expr
+    %%
+    Expr: "trailing";
+
+lexer: |
+    %%
+    [a-zA-Z]\  "trailing"
+    [\n\t] ;
+
+extra_files:
+  input1.input_trailing_ws: |
+    a