Skip to content

Commit 124f613

Browse files
committed
Stop trimming escaped spaces off the end regex
1 parent 12067d6 commit 124f613

3 files changed

Lines changed: 91 additions & 1 deletion

File tree

lrlex/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,4 @@ serde = { workspace = true, optional = true }
3838
prettyplease.workspace = true
3939
syn.workspace = true
4040
glob.workspace = true
41+
unicode-width.workspace = true

lrlex/src/lib/parser.rs

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ where
490490

491491
if !dupe {
492492
let (start_states, re_str) =
493-
self.parse_start_states(i, line[..rspace].trim_end_matches(matches_whitespace))?;
493+
self.parse_start_states(i, trim_end_unescaped(&line[..rspace]))?;
494494
let rules_len = self.rules.len();
495495
let tok_id = LexerTypesT::StorageT::try_from(rules_len)
496496
.unwrap_or_else(|_| panic!("StorageT::try_from \
@@ -685,6 +685,43 @@ where
685685
}
686686
}
687687

688+
fn trim_end_unescaped(s: &str) -> &str {
689+
use unicode_width::UnicodeWidthChar;
690+
let mut cbuf = [0; 4];
691+
let mut initial_space_bytes = 0;
692+
let mut last_char_width = 0;
693+
// First loop over spaces
694+
for ch in s.chars().rev().into_iter() {
695+
if RE_SPACE_SEP.is_match(ch.encode_utf8(&mut cbuf)) {
696+
last_char_width = ch.width().unwrap_or(0);
697+
initial_space_bytes += last_char_width;
698+
} else {
699+
break;
700+
}
701+
}
702+
if initial_space_bytes == 0 {
703+
return s;
704+
}
705+
let mut preceeding_backslashes = 0;
706+
// Next loop over escaped slashes or spaces,
707+
// an even number of backslashes are all escaping slashes,
708+
// and an odd number of backslashes must have a trailing escaped space.
709+
for c in s[..s.len() - initial_space_bytes].chars().rev() {
710+
if c == '\\' {
711+
preceeding_backslashes += 1
712+
} else {
713+
break;
714+
}
715+
}
716+
// The backslash count was odd, the last must escape a space.
717+
// Drop one of the intial spaces from the trim.
718+
if preceeding_backslashes % 2 == 1 {
719+
initial_space_bytes -= last_char_width;
720+
}
721+
722+
&s[..s.len() - initial_space_bytes]
723+
}
724+
688725
#[cfg(test)]
689726
mod test {
690727
use super::*;
@@ -1826,4 +1863,37 @@ b "A"
18261863
18,
18271864
);
18281865
}
1866+
1867+
#[test]
fn unescaped_trim() {
    // Each entry is `(input, expected)`. Inputs come in pairs: one trailing
    // space, then two trailing spaces, so that both "nothing extra to trim"
    // and "trim the rest but keep the single escaped space" are exercised for
    // every backslash count.
    let escapes = [
        // Only backslashes before the trailing spaces.
        (r#"\ "#, r#"\ "#),
        (r#"\  "#, r#"\ "#),
        (r#"\\ "#, r#"\\"#),
        (r#"\\  "#, r#"\\"#),
        (r#"\\\ "#, r#"\\\ "#),
        (r#"\\\  "#, r#"\\\ "#),
        (r#"\\\\ "#, r#"\\\\"#),
        (r#"\\\\  "#, r#"\\\\"#),
        // No trailing whitespace at all: returned unchanged.
        (r#"x"#, r#"x"#),
        // An ordinary character ahead of the backslash run.
        (r#"x\ "#, r#"x\ "#),
        (r#"x\  "#, r#"x\ "#),
        (r#"x\\ "#, r#"x\\"#),
        (r#"x\\  "#, r#"x\\"#),
        (r#"x\\\ "#, r#"x\\\ "#),
        (r#"x\\\  "#, r#"x\\\ "#),
        (r#"x\\\\ "#, r#"x\\\\"#),
        (r#"x\\\\  "#, r#"x\\\\"#),
        // An escaped space in the middle must not affect trimming at the end.
        (r#"x\ y "#, r#"x\ y"#),
        (r#"x\ y\ "#, r#"x\ y\ "#),
        (r#"x\ y\\ "#, r#"x\ y\\"#),
        (r#"x\ y  "#, r#"x\ y"#),
        (r#"x\ y\  "#, r#"x\ y\ "#),
        (r#"x\ y\\  "#, r#"x\ y\\"#),
    ];
    for (escaped, expected) in escapes {
        let trimmed = trim_end_unescaped(escaped);
        // Echo the input so a failing row can be identified.
        assert_eq!(expected, trimmed, "input: {:?}", escaped);
    }
}
18291899
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
name: Test with regex containing trailing ws
2+
grammar: |
3+
%grmtools {
4+
yacckind: Original(YaccOriginalActionKind::NoAction),
5+
recoverer: RecoveryKind::None,
6+
test_files: ["*.input_trailing_ws"],
7+
}
8+
%start Expr
9+
%%
10+
Expr: "trailing";
11+
12+
lexer: |
13+
%%
14+
[a-zA-Z]\ "trailing"
15+
[\n\t] ;
16+
17+
extra_files:
18+
input1.input_trailing_ws: |
19+
a

0 commit comments

Comments
 (0)