Skip to content

Commit 164f5fa

Browse files
committed
Add tests for negative lookahead and Unicode
These tests cover:
- Parser: negative lookahead with nonterminals, terminals, charsets, grouped expressions, within sequences, repetitions, and alternations; error case for trailing `!`; Unicode code points with 4, 5, and 6 hex digits; charset ranges with `Character::Char`, `Character::Unicode`, and mixed forms; charsets combining named entries, terminals, and Unicode ranges.
- Markdown renderer: negative lookahead rendering with `!`, Unicode rendering as `U+xxxx`, charset rendering with char and Unicode ranges, cut and neg expression rendering, and markdown escaping.
- Railroad renderer: negative lookahead renders as a "not followed by" labeled box, Unicode renders as terminal, charset ranges, cut renders as "no backtracking" labeled box, and neg expression renders as "with the exception of" labeled box.
1 parent fc15897 commit 164f5fa

3 files changed

Lines changed: 624 additions & 2 deletions

File tree

tools/grammar/src/parser.rs

Lines changed: 330 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@ fn translate_position(input: &str, index: usize) -> (&str, usize, usize) {
573573
#[cfg(test)]
574574
mod tests {
575575
use crate::parser::{parse_grammar, translate_position};
576-
use crate::{ExpressionKind, Grammar, RangeLimit};
576+
use crate::{Character, Characters, ExpressionKind, Grammar, RangeLimit};
577577
use std::path::Path;
578578

579579
#[test]
@@ -778,4 +778,333 @@ mod tests {
778778
assert_eq!(max, Some(1));
779779
assert!(matches!(limit, RangeLimit::HalfOpen));
780780
}
781+
782+
// --- Negative lookahead tests ---
783+
784+
/// A production body consisting solely of `!Foo` parses to a negative
/// lookahead whose operand is the nonterminal `Foo`.
#[test]
fn lookahead_simple_nonterminal() {
    let parsed = parse("Rule -> !Foo").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let operand = match &production.expression.kind {
        ExpressionKind::NegativeLookahead(operand) => operand,
        other => panic!("expected NegativeLookahead, got {:?}", other),
    };
    assert!(matches!(&operand.kind, ExpressionKind::Nt(name) if name == "Foo"));
}
794+
795+
/// `!` in front of a terminal applies to that terminal only: the production
/// is a two-element sequence of the lookahead followed by the nonterminal
/// `Foo`.
#[test]
fn lookahead_terminal() {
    let parsed = parse("Rule -> !`'` Foo").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let items = match &production.expression.kind {
        ExpressionKind::Sequence(items) => items,
        other => panic!("expected Sequence, got {:?}", other),
    };
    assert_eq!(items.len(), 2);

    let operand = match &items[0].kind {
        ExpressionKind::NegativeLookahead(operand) => operand,
        other => panic!("expected NegativeLookahead, got {:?}", other),
    };
    assert!(matches!(&operand.kind, ExpressionKind::Terminal(text) if text == "'"));
    assert!(matches!(&items[1].kind, ExpressionKind::Nt(name) if name == "Foo"));
}
810+
811+
/// A negative lookahead may wrap a charset; the charset keeps both of its
/// terminal entries in source order.
#[test]
fn lookahead_charset() {
    let parsed = parse("Rule -> ![`e` `E`] SUFFIX").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let items = match &production.expression.kind {
        ExpressionKind::Sequence(items) => items,
        other => panic!("expected Sequence, got {:?}", other),
    };
    assert_eq!(items.len(), 2);

    let operand = match &items[0].kind {
        ExpressionKind::NegativeLookahead(operand) => operand,
        other => panic!("expected NegativeLookahead, got {:?}", other),
    };
    let entries = match &operand.kind {
        ExpressionKind::Charset(entries) => entries,
        other => panic!("expected Charset inside lookahead, got {:?}", other),
    };
    assert_eq!(entries.len(), 2);
    assert!(matches!(&entries[0], Characters::Terminal(text) if text == "e"));
    assert!(matches!(&entries[1], Characters::Terminal(text) if text == "E"));
}
830+
831+
/// A negative lookahead over a parenthesized alternation yields
/// lookahead -> grouped -> alt, with the three alternatives in source order.
#[test]
fn lookahead_grouped() {
    let parsed = parse("Rule -> !(`.` | `_` | XID_Start)").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let operand = match &production.expression.kind {
        ExpressionKind::NegativeLookahead(operand) => operand,
        other => panic!("expected NegativeLookahead, got {:?}", other),
    };
    let group = match &operand.kind {
        ExpressionKind::Grouped(group) => group,
        other => panic!("expected Grouped inside lookahead, got {:?}", other),
    };
    let branches = match &group.kind {
        ExpressionKind::Alt(branches) => branches,
        other => panic!("expected Alt inside Grouped, got {:?}", other),
    };
    assert_eq!(branches.len(), 3);
    assert!(matches!(&branches[0].kind, ExpressionKind::Terminal(text) if text == "."));
    assert!(matches!(&branches[1].kind, ExpressionKind::Terminal(text) if text == "_"));
    assert!(matches!(&branches[2].kind, ExpressionKind::Nt(name) if name == "XID_Start"));
}
850+
851+
/// A lookahead in the middle of a sequence occupies exactly one slot:
/// `A !B C` is a three-element sequence whose second element negates `B`.
#[test]
fn lookahead_in_sequence_middle() {
    let parsed = parse("Rule -> A !B C").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let items = match &production.expression.kind {
        ExpressionKind::Sequence(items) => items,
        other => panic!("expected Sequence, got {:?}", other),
    };
    assert_eq!(items.len(), 3);

    assert!(matches!(&items[0].kind, ExpressionKind::Nt(name) if name == "A"));
    let operand = match &items[1].kind {
        ExpressionKind::NegativeLookahead(operand) => operand,
        other => panic!("expected NegativeLookahead, got {:?}", other),
    };
    assert!(matches!(&operand.kind, ExpressionKind::Nt(name) if name == "B"));
    assert!(matches!(&items[2].kind, ExpressionKind::Nt(name) if name == "C"));
}
867+
868+
/// A lookahead inside a repeated group, `(!A B)*`, parses as
/// repeat -> grouped -> sequence, where the sequence is the lookahead on `A`
/// followed by the nonterminal `B`.
#[test]
fn lookahead_in_repetition() {
    let input = "Rule -> (!A B)*";
    let grammar = parse(input).unwrap();
    let rule = grammar.productions.get("Rule").unwrap();
    let ExpressionKind::Repeat(rep) = &rule.expression.kind else {
        panic!("expected Repeat, got {:?}", rule.expression.kind);
    };
    let ExpressionKind::Grouped(grouped) = &rep.kind else {
        panic!("expected Grouped inside Repeat, got {:?}", rep.kind);
    };
    let ExpressionKind::Sequence(seq) = &grouped.kind else {
        panic!("expected Sequence inside Grouped, got {:?}", grouped.kind);
    };
    assert_eq!(seq.len(), 2);
    // Check the operand as well, so a lookahead wrapping the wrong expression
    // is caught rather than accepted by a wildcard match.
    let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
        panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
    };
    assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "A"));
    assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B"));
}
886+
887+
/// `!A B | C` splits at the `|` first: the first alternative is the
/// two-element sequence `!A B`, the second is the nonterminal `C`.
#[test]
fn lookahead_in_alternation() {
    let input = "Rule -> !A B | C";
    let grammar = parse(input).unwrap();
    let rule = grammar.productions.get("Rule").unwrap();
    let ExpressionKind::Alt(alts) = &rule.expression.kind else {
        panic!("expected Alt, got {:?}", rule.expression.kind);
    };
    assert_eq!(alts.len(), 2);
    let ExpressionKind::Sequence(seq) = &alts[0].kind else {
        panic!("expected Sequence, got {:?}", alts[0].kind);
    };
    assert_eq!(seq.len(), 2);
    // Check the operand as well, so a lookahead wrapping the wrong expression
    // is caught rather than accepted by a wildcard match.
    let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
        panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
    };
    assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "A"));
    assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B"));
    assert!(matches!(&alts[1].kind, ExpressionKind::Nt(n) if n == "C"));
}
904+
905+
/// A `!` with nothing after it is rejected, and the error text names the
/// missing expression.
#[test]
fn lookahead_fail_trailing() {
    let input = "Rule -> !";
    let err = parse(input).unwrap_err();
    // Include the actual error in the failure output so a changed message is
    // diagnosable without re-running under a debugger.
    assert!(
        err.contains("expected expression after !"),
        "unexpected error message: {err:?}"
    );
}
911+
912+
// --- Unicode tests ---
913+
914+
/// A four-hex-digit code point, `U+0009`, parses to the tab character and
/// keeps the digits as written.
#[test]
fn unicode_4_digit() {
    let parsed = parse("Rule -> U+0009").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let (ch, digits) = match &production.expression.kind {
        ExpressionKind::Unicode((ch, digits)) => (ch, digits),
        other => panic!("expected Unicode, got {:?}", other),
    };
    assert_eq!(*ch, '\t');
    assert_eq!(digits, "0009");
}
925+
926+
/// A five-hex-digit code point, `U+E0000`, parses to the matching `char`
/// and keeps the digits as written.
#[test]
fn unicode_5_digit() {
    let parsed = parse("Rule -> U+E0000").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let (ch, digits) = match &production.expression.kind {
        ExpressionKind::Unicode((ch, digits)) => (ch, digits),
        other => panic!("expected Unicode, got {:?}", other),
    };
    assert_eq!(*ch, '\u{E0000}');
    assert_eq!(digits, "E0000");
}
937+
938+
/// A six-hex-digit code point, `U+10FFFF`, parses to the matching `char`
/// and keeps the digits as written.
#[test]
fn unicode_6_digit() {
    let parsed = parse("Rule -> U+10FFFF").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let (ch, digits) = match &production.expression.kind {
        ExpressionKind::Unicode((ch, digits)) => (ch, digits),
        other => panic!("expected Unicode, got {:?}", other),
    };
    assert_eq!(*ch, '\u{10FFFF}');
    assert_eq!(digits, "10FFFF");
}
949+
950+
/// Unicode code points can be alternatives: `U+0009 | U+000A` yields two
/// branches, tab then line feed.
#[test]
fn unicode_in_alternation() {
    let parsed = parse("Rule -> U+0009 | U+000A").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let branches = match &production.expression.kind {
        ExpressionKind::Alt(branches) => branches,
        other => panic!("expected Alt, got {:?}", other),
    };
    assert_eq!(branches.len(), 2);
    assert!(matches!(&branches[0].kind, ExpressionKind::Unicode((ch, _)) if *ch == '\t'));
    assert!(matches!(&branches[1].kind, ExpressionKind::Unicode((ch, _)) if *ch == '\n'));
}
968+
969+
// --- Character / charset range tests ---
970+
971+
/// A charset holding one `U+xxxx-U+xxxx` range parses to a single `Range`
/// entry with both bounds in `Character::Unicode` form.
#[test]
fn charset_unicode_range() {
    let parsed = parse("Rule -> [U+0000-U+007F]").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let entries = match &production.expression.kind {
        ExpressionKind::Charset(entries) => entries,
        other => panic!("expected Charset, got {:?}", other),
    };
    assert_eq!(entries.len(), 1);

    let (low, high) = match &entries[0] {
        Characters::Range(low, high) => (low, high),
        other => panic!("expected Range, got {:?}", other),
    };
    assert!(matches!(low, Character::Unicode((ch, _)) if *ch == '\0'));
    assert!(matches!(high, Character::Unicode((ch, _)) if *ch == '\u{7F}'));
}
989+
990+
/// A charset range written with backtick terminals parses to a single
/// `Range` entry with both bounds in `Character::Char` form.
#[test]
fn charset_char_range() {
    let parsed = parse("Rule -> [`a`-`z`]").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let entries = match &production.expression.kind {
        ExpressionKind::Charset(entries) => entries,
        other => panic!("expected Charset, got {:?}", other),
    };
    assert_eq!(entries.len(), 1);

    let (low, high) = match &entries[0] {
        Characters::Range(low, high) => (low, high),
        other => panic!("expected Range, got {:?}", other),
    };
    assert!(matches!(low, Character::Char(ch) if *ch == 'a'));
    assert!(matches!(high, Character::Char(ch) if *ch == 'z'));
}
1005+
1006+
/// A range may mix bound forms: a backtick lower bound with a `U+xxxx`
/// upper bound (`U+007A` is `z`).
#[test]
fn charset_mixed_range() {
    let parsed = parse("Rule -> [`a`-U+007A]").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let entries = match &production.expression.kind {
        ExpressionKind::Charset(entries) => entries,
        other => panic!("expected Charset, got {:?}", other),
    };
    assert_eq!(entries.len(), 1);

    let (low, high) = match &entries[0] {
        Characters::Range(low, high) => (low, high),
        other => panic!("expected Range, got {:?}", other),
    };
    assert!(matches!(low, Character::Char(ch) if *ch == 'a'));
    assert!(matches!(high, Character::Unicode((ch, _)) if *ch == 'z'));
}
1024+
1025+
/// Two whitespace-separated Unicode ranges in one charset parse to two
/// `Range` entries, in source order.
#[test]
fn charset_multiple_unicode_ranges() {
    let parsed = parse("Rule -> [U+0000-U+D7FF U+E000-U+10FFFF]").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let entries = match &production.expression.kind {
        ExpressionKind::Charset(entries) => entries,
        other => panic!("expected Charset, got {:?}", other),
    };
    assert_eq!(entries.len(), 2);

    let (first_low, first_high) = match &entries[0] {
        Characters::Range(low, high) => (low, high),
        other => panic!("expected Range, got {:?}", other),
    };
    assert!(matches!(first_low, Character::Unicode((ch, _)) if *ch == '\0'));
    assert!(matches!(first_high, Character::Unicode((ch, _)) if *ch == '\u{D7FF}'));

    let (second_low, second_high) = match &entries[1] {
        Characters::Range(low, high) => (low, high),
        other => panic!("expected Range, got {:?}", other),
    };
    assert!(matches!(second_low, Character::Unicode((ch, _)) if *ch == '\u{E000}'));
    assert!(matches!(second_high, Character::Unicode((ch, _)) if *ch == '\u{10FFFF}'));
}
1045+
1046+
/// A charset can combine backtick terminals with a named entry; all three
/// entries are kept in source order.
#[test]
fn charset_terminals_and_named() {
    let parsed = parse("Rule -> [`a` `b` Foo]").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let entries = match &production.expression.kind {
        ExpressionKind::Charset(entries) => entries,
        other => panic!("expected Charset, got {:?}", other),
    };
    assert_eq!(entries.len(), 3);
    assert!(matches!(&entries[0], Characters::Terminal(text) if text == "a"));
    assert!(matches!(&entries[1], Characters::Terminal(text) if text == "b"));
    assert!(matches!(&entries[2], Characters::Named(name) if name == "Foo"));
}
1059+
1060+
// --- Negative lookahead combined with charset ---
1061+
1062+
/// A negative lookahead over a charset that mixes terminals with a named
/// entry keeps all three entries in source order.
#[test]
fn lookahead_charset_with_named_and_terminals() {
    // Pattern from tokens.md: ![`'` `\` LF CR TAB] ASCII
    let parsed = parse("Rule -> ![`x` `y` LF] Foo").unwrap();
    let production = parsed.productions.get("Rule").unwrap();
    let items = match &production.expression.kind {
        ExpressionKind::Sequence(items) => items,
        other => panic!("expected Sequence, got {:?}", other),
    };
    assert_eq!(items.len(), 2);

    let operand = match &items[0].kind {
        ExpressionKind::NegativeLookahead(operand) => operand,
        other => panic!("expected NegativeLookahead, got {:?}", other),
    };
    let entries = match &operand.kind {
        ExpressionKind::Charset(entries) => entries,
        other => panic!("expected Charset, got {:?}", other),
    };
    assert_eq!(entries.len(), 3);
    assert!(matches!(&entries[0], Characters::Terminal(text) if text == "x"));
    assert!(matches!(&entries[1], Characters::Terminal(text) if text == "y"));
    assert!(matches!(&entries[2], Characters::Named(name) if name == "LF"));
}
1083+
1084+
// --- Negative lookahead combined with Unicode ---
1085+
1086+
/// A negative lookahead over a charset holding one Unicode range, followed
/// by the nonterminal `Foo`, parses to a two-element sequence whose first
/// element wraps the range `U+0000-U+007F`.
#[test]
fn lookahead_charset_with_unicode_range() {
    let input = "Rule -> ![U+0000-U+007F] Foo";
    let grammar = parse(input).unwrap();
    let rule = grammar.productions.get("Rule").unwrap();
    let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
        panic!("expected Sequence, got {:?}", rule.expression.kind);
    };
    // Pin the sequence shape before indexing, matching the sibling lookahead
    // tests; otherwise a stray or missing element would go unnoticed.
    assert_eq!(seq.len(), 2);
    let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
        panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
    };
    let ExpressionKind::Charset(chars) = &inner.kind else {
        panic!("expected Charset, got {:?}", inner.kind);
    };
    assert_eq!(chars.len(), 1);
    let Characters::Range(a, b) = &chars[0] else {
        panic!("expected Range, got {:?}", chars[0]);
    };
    assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0'));
    assert!(matches!(
        b,
        Character::Unicode((ch, _)) if *ch == '\u{7F}'
    ));
    // The lookahead must not swallow the expression that follows it.
    assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "Foo"));
}
7811110
}

0 commit comments

Comments
 (0)