Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 94 additions & 9 deletions lrlex/src/lib/ctbuilder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ use cfgrammar::{
Setting, Value,
},
markmap::MergeBehavior,
span::Location,
span::{Location, Span},
yacc::YaccGrammar,
};
use glob::glob;
use lazy_static::lazy_static;
Expand Down Expand Up @@ -497,7 +498,7 @@ where
if let Some(inspect_lexerkind_cb) = self.inspect_lexerkind_cb {
inspect_lexerkind_cb(lexerkind)?
}
let (lexerdef, lex_flags): (LRNonStreamingLexerDef<LexerTypesT>, LexFlags) =
let (mut lexerdef, lex_flags): (LRNonStreamingLexerDef<LexerTypesT>, LexFlags) =
match lexerkind {
LexerKind::LRNonStreamingLexer => {
let lex_flags = LexFlags::try_from(&mut header)?;
Expand All @@ -521,15 +522,16 @@ where
}
};

let mut has_unallowed_missing = false;
if let Some(ref lrcfg) = self.lrpar_config {
let mut lexerdef = lexerdef.clone();
let mut closure_lexerdef = lexerdef.clone();
let mut ctp = CTParserBuilder::<LexerTypesT>::new().inspect_rt(Box::new(
move |yacc_header, rtpb, rule_ids_map, grm_path| {
let owned_map = rule_ids_map
.iter()
.map(|(x, y)| (&**x, *y))
.collect::<HashMap<_, _>>();
lexerdef.set_rule_ids(&owned_map);
closure_lexerdef.set_rule_ids(&owned_map);
yacc_header.mark_used(&"test_files".to_string());
let test_glob = yacc_header.get("test_files");
match test_glob {
Expand All @@ -540,7 +542,8 @@ where
{
let path = path?;
let input = fs::read_to_string(&path)?;
let l: LRNonStreamingLexer<LexerTypesT> = lexerdef.lexer(&input);
let l: LRNonStreamingLexer<LexerTypesT> =
closure_lexerdef.lexer(&input);
for e in rtpb.parse_noaction(&l) {
Err(format!("parsing {}: {}", path.display(), e))?
}
Expand All @@ -553,8 +556,58 @@ where
},
));
ctp = lrcfg(ctp);
let map = ctp.build()?;
self.rule_ids_map = Some(map.token_map().to_owned());
let ct_parser = ctp.build()?;
self.rule_ids_map = Some(ct_parser.token_map().to_owned());
let (missing_from_lexer, missing_from_parser) =
set_rule_ids(&mut lexerdef, ct_parser.yacc_grammar());
let yacc_diag =
SpannedDiagnosticFormatter::new(ct_parser.grammar_src(), ct_parser.grammar_path());
let err_indent = " ".repeat(ERROR.len());
if !self.allow_missing_terms_in_lexer {
if let Some(token_spans) = missing_from_lexer {
eprintln!(
"{ERROR} these tokens are not referenced in the lexer but defined as follows"
);
eprintln!(
"{err_indent} {}",
yacc_diag.file_location_msg("in the grammar", None)
);
for span in token_spans {
eprintln!(
"{}",
yacc_diag.underline_span_with_text(
span,
"Missing from lexer".to_string(),
'^'
)
);
}
eprintln!();
has_unallowed_missing = true;
}
}
if !self.allow_missing_tokens_in_parser {
if let Some(token_spans) = missing_from_parser {
eprintln!(
"{ERROR} these tokens are not referenced in the grammar but defined as follows"
);
eprintln!(
"{err_indent} {}",
lex_diag.file_location_msg("in the lexer", None)
);
for span in token_spans {
eprintln!(
"{}",
lex_diag.underline_span_with_text(
span,
"Missing from parser".to_string(),
'^'
)
);
}
has_unallowed_missing = true;
}
}
}

let mut lexerdef = Box::new(lexerdef);
Expand All @@ -565,6 +618,11 @@ where
);
}

if has_unallowed_missing {
fs::remove_file(outp).ok();
panic!();
}

let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
Some(ref rim) => {
// Convert from HashMap<String, _> to HashMap<&str, _>
Expand All @@ -580,8 +638,6 @@ where
}
None => (None, None),
};

let mut has_unallowed_missing = false;
if !self.allow_missing_terms_in_lexer {
if let Some(ref mfl) = missing_from_lexer {
eprintln!("Error: the following tokens are used in the grammar but are not defined in the lexer:");
Expand Down Expand Up @@ -1172,6 +1228,35 @@ fn indent(indent: &str, s: &str) -> String {
format!("{indent}{}\n", s.trim_end_matches('\n')).replace('\n', &format!("\n{}", indent))
}

fn set_rule_ids<ST, LexerTypesT: LexerTypes<StorageT = ST>, LT: LexerDef<LexerTypesT>>(
lexerdef: &mut LT,
grm: &YaccGrammar<ST>,
) -> (Option<Vec<Span>>, Option<Vec<Span>>)
where
ST: 'static + Copy + PrimInt + Unsigned,
usize: num_traits::AsPrimitive<ST>,
{
let rule_ids = grm
.tokens_map()
.iter()
.map(|(&n, &i)| (n, i.0))
.collect::<std::collections::HashMap<&str, ST>>();
let (missing_from_lexer, missing_from_parser) = lexerdef.set_rule_ids_spanned(&rule_ids);
let missing_from_lexer = missing_from_lexer.map(|tokens| {
tokens
.iter()
.map(|name| {
grm.token_span(*grm.tokens_map().get(name).unwrap())
.expect("Given token should have a span")
})
.collect::<Vec<_>>()
});

let missing_from_parser =
missing_from_parser.map(|tokens| tokens.iter().map(|(_, span)| *span).collect::<Vec<_>>());
(missing_from_lexer, missing_from_parser)
}

#[cfg(test)]
mod test {
use std::fs::File;
Expand Down
39 changes: 31 additions & 8 deletions lrpar/src/lib/ctbuilder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,9 @@ where
return Ok(CTParser {
regenerated: false,
rule_ids,
yacc_grammar: grm,
grammar_src: inc,
grammar_path: self.grammar_path.unwrap(),
conflicts: None,
});
} else {
Expand Down Expand Up @@ -779,13 +782,16 @@ where
&format!("/* CACHE INFORMATION {} */\n", cache),
)?;
let conflicts = if stable.conflicts().is_some() {
Some((grm, sgraph, stable))
Some((sgraph, stable))
} else {
None
};
Ok(CTParser {
regenerated: true,
rule_ids,
yacc_grammar: grm,
grammar_src: inc,
grammar_path: self.grammar_path.unwrap(),
conflicts,
})
}
Expand Down Expand Up @@ -1489,11 +1495,10 @@ where
{
regenerated: bool,
rule_ids: HashMap<String, StorageT>,
Copy link
Copy Markdown
Collaborator Author

@ratmice ratmice May 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I guess I would suggest a good first step is to focus on what to do about the duplication of rule_ids here.
Since it actually impacts semver, but also contributes to the other code duplication.

The thing to note is that this patch is adding yacc_grammar field to this struct,
and rule_ids field is just

self.rule_ids = self.yacc_grammar.
            .tokens_map()
            .iter()
            .map(|(&n, &i)| (n.to_owned(), i.as_storaget()))
            .collect::<HashMap<_, _>>();

So we are essentially storing 2 copies of the rule_ids, one with the &str refs converted to_owned().
It is worth noting that in lrlex we make another owned_map clone of this value in CTLexerBuilder::build(),
as part of the existing code path.

        let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
            Some(ref rim) => {
                // Convert from HashMap<String, _> to HashMap<&str, _>
                let owned_map = rim
                    .iter()
                    .map(|(x, y)| (&**x, *y))
                    .collect::<HashMap<_, _>>();

The main issue I see is the semver changes to CTParser::token_map, it isn't actually possible to reimplement the token_map function using self.yacc_grammar, because of the ref-to-owned vs ref-to-ref differences.

    /// Returns a [HashMap] from lexeme string types to numeric types (e.g. `INT: 2`), suitable for
    /// handing to a lexer to coordinate the IDs of lexer and parser.
    pub fn token_map(&self) -> &HashMap<String, StorageT> {
        &self.rule_ids
    }

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dumb idea (off the top of my head): does this suggest that rule_ids should be wrapped in something like an Rc?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The alternative to changing this is just keeping both the rule_ids and yacc_grammar field duplication,
and rewriting the set_rule_ids_spanned codepath from nimbleparse to use the rule_ids field,
making this new code path conform to the existing CTLexerBuilder code paths instead.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if we were going to use Rc while avoiding duplication that Rc would still need to be within YaccGrammar, the particular thing we want out of the YaccGrammar is we need to call token_span

Then because we need that, it means we also already have YaccGrammar::token_map with token_map itself based upon iter_tidxs and also allocating a HashMap.

I suppose that would leave us with:

  1. Alloc a HashMap in YaccGrammar::token_map
  2. A rule_ids: Rc<HashMap<...>> shared by CTParserBuilder/CTLexerBuilder

So I guess this is also a variation of keeping both the rule_ids and yacc_grammar fields.

Copy link
Copy Markdown
Collaborator Author

@ratmice ratmice May 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, I guess another thing to take into account, we have these two functions here:

https://docs.rs/lrlex/latest/lrlex/trait.LexerDef.html#tymethod.set_rule_ids
https://docs.rs/lrlex/latest/lrlex/trait.LexerDef.html#tymethod.set_rule_ids_spanned

Which are the ultimate destination for both these variations, so another question I have is if we had versions
of those functions which accepted an IntoIterator<Item=(&str, LexerTypesT::StorageT)> as a parameter instead of the &HashMap<&str, LexerTypesT::StorageT>, maybe it would be possible to get rid of all these allocations entirely?

Edit: This may be a dumb idea, it looks like both set_rule_ids variations use the hashmaps to actually do lookups, so the simplest implementation of these functions taking an IntoIterator would actually use HashMap::from_iter()

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In f4a42cb I tried messing with the iterator API...

Returning a Box<HashMap> from token_map in place of the rule_id field, I'm not sure if the iterator based API pulls it's weight really, we still end up with clones for backwards compatibility of return values, and the CTLexerBuilder::rule_ids_map function.

That said it seems like a bit of an improvement over the original PR implementation, but maybe it's not worth trying to remove the duplicated values which we can't seem to benefit from fully without more drastic semver breakage.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, where we break semver, I'd hope to keep it to "nice" parts of the API. This one does feel like it will intrude on pretty much every compile-time user? I suspect, though, there is a nice API here, and it's just eluding us right now...

Copy link
Copy Markdown
Collaborator Author

@ratmice ratmice May 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the actual breakage is pretty small, because of the signature for

CTLexerBuilder::rule_ids_map
has std::borrow::Borrow<HashMap<String, LexerTypesT::StorageT>,

The only change I had to make to any compile-time stuff was the manual lexer,
because ct_token_map uses &HashMap in it's signature rather than the Borrow<...>. Perhaps we can switch that to use Borrow<> to further reduce breakage.

Also, I hadn't noticed that all of CTLexer's fields and methods are private so changing the types of those fields isn't actually a semver break like I had thought.

Copy link
Copy Markdown
Collaborator Author

@ratmice ratmice May 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I did 8b973f2, but I'm starting to think that in this PR I should retreat from removing duplicate fields, and reducing allocations.

I think if we allow the duplicate rule_id field, we maybe can get something working without any semver breakage, or code duplication, very similar to how this patch is now, but without the token_map or LexerDef::set_rule_ids_spanned_iter change.

Then we can potentially revisit all that in a separate patch?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Oops, s/nice/niche/. That changed the meaning of that sentence quite profoundly. Sorry!]

conflicts: Option<(
YaccGrammar<StorageT>,
StateGraph<StorageT>,
StateTable<StorageT>,
)>,
yacc_grammar: YaccGrammar<StorageT>,
grammar_src: String,
grammar_path: PathBuf,
conflicts: Option<(StateGraph<StorageT>, StateTable<StorageT>)>,
}

impl<StorageT> CTParser<StorageT>
Expand Down Expand Up @@ -1527,11 +1532,29 @@ where
&StateTable<StorageT>,
&Conflicts<StorageT>,
)> {
if let Some((grm, sgraph, stable)) = &self.conflicts {
return Some((grm, sgraph, stable, stable.conflicts().unwrap()));
if let Some((sgraph, stable)) = &self.conflicts {
return Some((
&self.yacc_grammar,
sgraph,
stable,
stable.conflicts().unwrap(),
));
}
None
}

#[doc(hidden)]
pub fn yacc_grammar(&self) -> &YaccGrammar<StorageT> {
&self.yacc_grammar
}
#[doc(hidden)]
pub fn grammar_src(&self) -> &str {
&self.grammar_src
}
#[doc(hidden)]
pub fn grammar_path(&self) -> &Path {
self.grammar_path.as_path()
}
}

/// Indents a multi-line string and trims any trailing newline.
Expand Down