diff --git a/lrlex/src/lib/ctbuilder.rs b/lrlex/src/lib/ctbuilder.rs index 4c258140e..cd4aec811 100644 --- a/lrlex/src/lib/ctbuilder.rs +++ b/lrlex/src/lib/ctbuilder.rs @@ -15,7 +15,11 @@ use std::{ }; use bincode::Encode; -use cfgrammar::{newlinecache::NewlineCache, Spanned}; +use cfgrammar::{ + header::{GrmtoolsSectionParser, HeaderError, HeaderErrorKind, Namespaced, Setting, Value}, + newlinecache::NewlineCache, + Spanned, +}; use lazy_static::lazy_static; use lrpar::{CTParserBuilder, LexerTypes}; use num_traits::{AsPrimitive, PrimInt, Unsigned}; @@ -37,6 +41,69 @@ pub enum LexerKind { LRNonStreamingLexer, } +impl TryFrom<&Value> for LexerKind { + type Error = cfgrammar::header::HeaderError; + fn try_from(it: &Value) -> Result { + match it { + Value::Flag(_, loc) => Err(HeaderError { + kind: HeaderErrorKind::ConversionError( + "LexerKind", + "Expected `LexerKind` found bool", + ), + locations: vec![loc.clone()], + }), + Value::Setting(Setting::Num(_, loc)) => Err(HeaderError { + kind: HeaderErrorKind::ConversionError( + "LexerKind", + "Expected `LexerKind` found numeric", + ), + locations: vec![loc.clone()], + }), + Value::Setting(Setting::Constructor { + ctor: + Namespaced { + namespace: _, + member: (_, loc), + }, + arg: _, + }) => Err(HeaderError { + kind: HeaderErrorKind::ConversionError( + "LexerKind", + "Expected `LexerKind` found constructor", + ), + locations: vec![loc.clone()], + }), + Value::Setting(Setting::Unitary(Namespaced { + namespace, + member: (member, member_loc), + })) => { + if let Some((ns, loc)) = namespace { + if ns.to_lowercase() != "lexerkind" { + return Err(HeaderError { + kind: HeaderErrorKind::ConversionError( + "LexerKind", + "Expected namespace `LexerKind`", + ), + locations: vec![loc.clone()], + }); + } + } + if member.to_lowercase() != "lrnonstreaminglexer" { + return Err(HeaderError { + kind: HeaderErrorKind::ConversionError( + "LexerKind", + "Unknown `LexerKind` Variant", + ), + locations: vec![member_loc.clone()], + }); + } + + Ok(LexerKind::LRNonStreamingLexer) + } + } + } +} + /// Specify the visibility of the module generated by [CTLexerBuilder]. #[derive(Clone, PartialEq, Eq, Debug)] #[non_exhaustive] @@ -129,7 +196,7 @@ where lrpar_config: Option) -> CTParserBuilder>>, lexer_path: Option, output_path: Option, - lexerkind: LexerKind, + lexerkind: Option, mod_name: Option<&'a str>, visibility: Visibility, rust_edition: RustEdition, @@ -138,6 +205,8 @@ where allow_missing_tokens_in_parser: bool, force_lex_flags: LexFlags, default_lex_flags: LexFlags, + #[cfg(test)] + inspect_lexerkind_cb: Option Result<(), Box>>>, } impl CTLexerBuilder<'_, DefaultLexerTypes> { @@ -174,7 +243,7 @@ where lrpar_config: None, lexer_path: None, output_path: None, - lexerkind: LexerKind::LRNonStreamingLexer, + lexerkind: None, mod_name: None, visibility: Visibility::Private, rust_edition: RustEdition::Rust2021, @@ -183,6 +252,8 @@ where allow_missing_tokens_in_parser: true, force_lex_flags: UNSPECIFIED_LEX_FLAGS, default_lex_flags: UNSPECIFIED_LEX_FLAGS, + #[cfg(test)] + inspect_lexerkind_cb: None, } } @@ -287,7 +358,7 @@ where /// Set the type of lexer to be generated to `lexerkind`. pub fn lexerkind(mut self, lexerkind: LexerKind) -> Self { - self.lexerkind = lexerkind; + self.lexerkind = Some(lexerkind); self } @@ -367,12 +438,32 @@ where } lk.insert(outp.clone()); } - let lex_src = read_to_string(lexerp) .map_err(|e| format!("When reading '{}': {e}", lexerp.display()))?; + let (header, _) = GrmtoolsSectionParser::new(&lex_src, false) + .parse() + .map_err(|es| { + es.iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n") + })?; + let lexerkind = match self.lexerkind { + Some(lexerkind) => lexerkind, + None => { + if let Some((_, lk_val)) = header.get("lexerkind") { + LexerKind::try_from(lk_val)? + } else { + LexerKind::LRNonStreamingLexer + } + } + }; let line_cache = NewlineCache::from_str(&lex_src).unwrap(); - let (mut lexerdef, lex_flags): (Box>, LexFlags) = match self - .lexerkind + #[cfg(test)] + if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb { + inspect_lexerkind_cb(lexerkind)? + } + let (mut lexerdef, lex_flags): (Box>, LexFlags) = match lexerkind { LexerKind::LRNonStreamingLexer => { let lexerdef = LRNonStreamingLexerDef::::new_with_options( @@ -529,7 +620,7 @@ where let rules = vec![#(#rules),*]; }); } - let lexerdef_ty = match self.lexerkind { + let lexerdef_ty = match lexerkind { LexerKind::LRNonStreamingLexer => { quote!(::lrlex::LRNonStreamingLexerDef) } @@ -801,6 +892,15 @@ where self.default_lex_flags = flags; self } + + #[cfg(test)] + pub fn inspect_lexerkind( + mut self, + cb: Box Result<(), Box>>, + ) -> Self { + self.inspect_lexerkind_cb = Some(cb); + self + } } /// An interface to the result of [CTLexerBuilder::build()]. @@ -897,3 +997,46 @@ pub fn ct_token_map( f.write_all(outs.as_bytes())?; Ok(()) } + +#[cfg(test)] +mod test { + use std::fs::File; + use std::io::Write; + + use super::{CTLexerBuilder, LexerKind}; + #[test] + fn test_grmtools_section_lexerkind() { + let lexerkinds = [ + "LRNonStreamingLexer", + "lrnonstreaminglexer", + "LexerKind::lrnonstreaminglexer", + "lexerkind::LRNonStreamingLexer", + ]; + for (i, kind) in lexerkinds.iter().enumerate() { + let lex_src = format!( + " +%grmtools{{lexerkind: {}}} +%% +. ; +", + kind + ); + let lex_path = format!( + "{}/test_grmtools_section_lexerkind_{}.l", + env!("OUT_DIR"), + i + ); + let mut l_file = File::create(lex_path.clone()).unwrap(); + l_file.write_all(lex_src.as_bytes()).unwrap(); + CTLexerBuilder::new() + .output_path(format!("{}.rs", lex_path.clone())) + .lexer_path(lex_path.clone()) + .inspect_lexerkind(Box::new(move |lexerkind| { + assert!(matches!(lexerkind, LexerKind::LRNonStreamingLexer)); + Ok(()) + })) + .build() + .unwrap(); + } + } +} diff --git a/lrlex/src/lib/parser.rs b/lrlex/src/lib/parser.rs index 4860a8cee..cd4b7c0c3 100644 --- a/lrlex/src/lib/parser.rs +++ b/lrlex/src/lib/parser.rs @@ -34,6 +34,7 @@ lazy_static! { static ref RE_LEADING_WS: Regex = Regex::new(r"^[\p{Pattern_White_Space}]*").unwrap(); static ref RE_WS: Regex = Regex::new(r"\p{Pattern_White_Space}").unwrap(); static ref RE_LEADING_DIGITS: Regex = Regex::new(r"^[0-9]+").unwrap(); + static ref RE_NAME: Regex = Regex::new(r"^[a-zA-Z][a-zA-Z]*").unwrap(); } const INITIAL_START_STATE_NAME: &str = "INITIAL"; @@ -252,7 +253,7 @@ where span_map: &mut HashMap<&str, Span>, lex_flags: &mut LexFlags, ) -> LexInternalBuildResult { - const OPTIONS: [&str; 12] = [ + const OPTIONS: [&str; 13] = [ "allow_wholeline_comments", "dot_matches_new_line", "multi_line", @@ -265,6 +266,7 @@ where "size_limit", "dfa_size_limit", "nest_limit", + "lexerkind", ]; let start_pos = i; // RegexBuilder isn't uniform regarding whether the default value of an options is true @@ -331,6 +333,26 @@ where } i = j; } + "lexerkind" => { + // We just want to skip this, we already know we're `LRNonStreamingLexer` + i = self.parse_ws(i)?; + if let Some(j) = self.lookahead_is(":", i) { + i = j + } else { + return Err(LexBuildError { + kind: LexErrorKind::InvalidGrmtoolsSectionValue, + spans: vec![Span::new(i, i)], + }); + } + i = self.parse_ws(i)?; + let (j, _) = self.parse_name(i)?; + i = self.parse_ws(j)?; + if let Some(j) = self.lookahead_is("::", j) { + i = self.parse_ws(j)?; + let (j, _) = self.parse_name(i)?; + i = j; + } + } _ => unreachable!(), } span_map.insert(opt, Span::new(start_pos, end_pos)); @@ -809,6 +831,19 @@ where }) } + fn parse_name(&self, i: usize) -> Result<(usize, String), LexBuildError> { + match RE_NAME.find(&self.src[i..]) { + Some(m) => { + assert_eq!(m.start(), 0); + Ok((i + m.end(), self.src[i..i + m.end()].to_string())) + } + None => Err(LexBuildError { + kind: LexErrorKind::InvalidGrmtoolsSectionValue, + spans: vec![Span::new(i, i)], + }), + } + } + fn parse_spaces(&mut self, i: usize) -> LexInternalBuildResult { Ok(RE_LEADING_SPACE_SEPS .find(&self.src[i..])