Skip to content

Commit 5c96dbd

Browse files
committed
parser, lexer: move explicit ident qualification to parser.
1 parent d71440c commit 5c96dbd

6 files changed

Lines changed: 155 additions & 132 deletions

File tree

lexer/src/lib.rs

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -209,12 +209,12 @@ pub enum Token<'a> {
209209
RlengthVariable,
210210
#[token("ENVIRON", accept_expression)]
211211
EnvironVariable,
212-
#[regex("(?&identifier)", Identifier::without_namespace::<0>)]
213-
#[regex(r"(?&identifier)::(?&identifier)", Identifier::with_namespace::<0>)]
212+
#[regex("(?&identifier)", Identifier::parse::<0>)]
214213
Identifier(Identifier<'a>),
215-
#[regex("@(?&identifier)", parse_indirect_call::<false>)]
216-
#[regex(r"@(?&identifier)::(?&identifier)", parse_indirect_call::<true>)]
214+
#[regex("@(?&identifier)", parse_indirect_call)]
217215
IndirectCall(Identifier<'a>),
216+
#[token("::")]
217+
PathSpec,
218218
#[token("+", accept_expression)]
219219
Plus,
220220
#[token("-", accept_expression)]
@@ -343,7 +343,6 @@ pub enum LexingError {
343343

344344
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
345345
pub struct Identifier<'a> {
346-
pub namespace: Option<&'a str>,
347346
pub literal: &'a str,
348347
}
349348

@@ -571,7 +570,7 @@ fn parse_num<'a>(lex: &mut Lexer<'a>) -> Token<'a> {
571570

572571
fn parse_non_posix_keyword<'a>(lex: &mut Lexer<'a>, other: Token<'a>) -> Token<'a> {
573572
if lex.extras.posix_strict {
574-
Token::Identifier(Identifier::without_namespace::<0>(lex))
573+
Token::Identifier(Identifier::parse::<0>(lex))
575574
} else {
576575
accept_expression(lex);
577576
other
@@ -592,38 +591,23 @@ fn parse_non_gnu_directive<'a>(lex: &mut Lexer<'a>) -> Result<Token<'a>> {
592591
if lex.extras.posix_strict {
593592
Err(LexingError::non_posix(lex))
594593
} else if lex.extras.gnu_strict {
595-
Ok(Token::IndirectCall(Identifier::without_namespace::<1>(lex)))
594+
Ok(Token::IndirectCall(Identifier::parse::<1>(lex)))
596595
} else {
597596
Ok(Token::ConcurrentDirective)
598597
}
599598
}
600599

601-
fn parse_indirect_call<'a, const QUALIFIED: bool>(lex: &mut Lexer<'a>) -> Result<Identifier<'a>> {
600+
fn parse_indirect_call<'a>(lex: &mut Lexer<'a>) -> Result<Identifier<'a>> {
602601
if lex.extras.posix_strict {
603602
Err(LexingError::non_posix(lex))
604-
} else if QUALIFIED {
605-
Identifier::with_namespace::<1>(lex)
606603
} else {
607-
Ok(Identifier::without_namespace::<1>(lex))
604+
Ok(Identifier::parse::<1>(lex))
608605
}
609606
}
610607

611608
impl<'a> Identifier<'a> {
612-
fn without_namespace<const SKIP: usize>(lex: &mut Lexer<'a>) -> Self {
613-
Self { namespace: None, literal: parse_ident(lex, SKIP..) }
614-
}
615-
616-
fn with_namespace<const SKIP: usize>(lex: &mut Lexer<'a>) -> Result<Self> {
617-
if lex.extras.posix_strict {
618-
Err(LexingError::non_posix(lex))
619-
} else {
620-
// SAFETY: The regex matching ensures it is present and well-formed.
621-
let separator = unsafe { memchr(b':', lex.slice()).unwrap_unchecked() };
622-
Ok(Self {
623-
namespace: Some(parse_ident(lex, SKIP..separator)),
624-
literal: parse_ident(lex, separator + 2..),
625-
})
626-
}
609+
fn parse<const SKIP: usize>(lex: &mut Lexer<'a>) -> Self {
610+
Self { literal: parse_ident(lex, SKIP..) }
627611
}
628612
}
629613

lexer/src/tests.rs

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,7 @@ fn lexer_test_uu_extensions() {
101101
let arena = Bump::new();
102102
assert_eq!(
103103
lex(b"@concurrent", &arena, false, true),
104-
&[Token::IndirectCall(Identifier {
105-
namespace: None,
106-
literal: "concurrent"
107-
})]
104+
&[Token::IndirectCall(Identifier { literal: "concurrent" })]
108105
);
109106
}
110107

@@ -114,8 +111,8 @@ fn lexer_test_gnu_pattern() {
114111
assert_eq!(
115112
&lex(b"BEGINFILE ENDFILE", &arena, true, false),
116113
&[
117-
Token::Identifier(Identifier { namespace: None, literal: "BEGINFILE" }),
118-
Token::Identifier(Identifier { namespace: None, literal: "ENDFILE" })
114+
Token::Identifier(Identifier { literal: "BEGINFILE" }),
115+
Token::Identifier(Identifier { literal: "ENDFILE" })
119116
]
120117
);
121118
}
@@ -166,13 +163,14 @@ fn lexer_test_ident_rules_non_posix() {
166163
&lex(b"1a::a a::1a _a", &arena, false, false),
167164
&[
168165
Token::Integer(1),
169-
Token::Identifier(Identifier { namespace: Some("a"), literal: "a" }),
170-
Token::Identifier(Identifier { namespace: None, literal: "a" }),
171-
Token::Colon,
172-
Token::Colon,
166+
Token::Identifier(Identifier { literal: "a" }),
167+
Token::PathSpec,
168+
Token::Identifier(Identifier { literal: "a" }),
169+
Token::Identifier(Identifier { literal: "a" }),
170+
Token::PathSpec,
173171
Token::Integer(1),
174-
Token::Identifier(Identifier { namespace: None, literal: "a" }),
175-
Token::Identifier(Identifier { namespace: None, literal: "_a" })
172+
Token::Identifier(Identifier { literal: "a" }),
173+
Token::Identifier(Identifier { literal: "_a" })
176174
]
177175
);
178176
}
@@ -203,7 +201,7 @@ fn lexer_test_general_tokens() {
203201
Token::BeginPattern,
204202
Token::OpenBrace,
205203
Token::Print,
206-
Token::Identifier(Identifier { namespace: None, literal: "a" }),
204+
Token::Identifier(Identifier { literal: "a" }),
207205
Token::Plus,
208206
Token::Integer(1),
209207
Token::ClosedBrace,
@@ -216,7 +214,9 @@ fn lexer_test_general_tokens() {
216214
Token::Record,
217215
Token::Integer(1),
218216
Token::EqualTo,
219-
Token::Identifier(Identifier { namespace: Some("foo"), literal: "bar" }),
217+
Token::Identifier(Identifier { literal: "foo" }),
218+
Token::PathSpec,
219+
Token::Identifier(Identifier { literal: "bar" }),
220220
Token::ClosedBrace,
221221
Token::Newline
222222
]
@@ -232,7 +232,7 @@ fn lexer_test_regex_ambiguity() {
232232
Token::Integer(1),
233233
Token::SlashAssign,
234234
Token::Number(1.),
235-
Token::Identifier(Identifier { namespace: None, literal: "a" }),
235+
Token::Identifier(Identifier { literal: "a" }),
236236
Token::SlashAssign,
237237
Token::Integer(1)
238238
]
@@ -394,7 +394,7 @@ fn lexer_test_slash_assign() {
394394
assert_eq!(
395395
&lex(b"a/=1", &arena, false, false),
396396
&[
397-
Token::Identifier(Identifier { namespace: None, literal: "a" }),
397+
Token::Identifier(Identifier { literal: "a" }),
398398
Token::SlashAssign,
399399
Token::Integer(1),
400400
]
@@ -482,8 +482,10 @@ fn lexer_test_indirect_call() {
482482
assert_eq!(
483483
&lex(b"@foo @ns::bar", &arena, false, false),
484484
&[
485-
Token::IndirectCall(Identifier { namespace: None, literal: "foo" }),
486-
Token::IndirectCall(Identifier { namespace: Some("ns"), literal: "bar" }),
485+
Token::IndirectCall(Identifier { literal: "foo" }),
486+
Token::IndirectCall(Identifier { literal: "ns" }),
487+
Token::PathSpec,
488+
Token::Identifier(Identifier { literal: "bar" })
487489
]
488490
);
489491
}
@@ -532,7 +534,7 @@ fn lexer_test_regex_literals() {
532534
assert_eq!(
533535
&lex(b"x~/dot+/", &arena, false, false),
534536
&[
535-
Token::Identifier(Identifier { namespace: None, literal: "x" }),
537+
Token::Identifier(Identifier { literal: "x" }),
536538
Token::Matching,
537539
Token::Regex(b"dot+".into()),
538540
]
@@ -548,7 +550,7 @@ fn lexer_test_switch_snippet() {
548550
&[
549551
Token::Switch,
550552
Token::OpenParent,
551-
Token::Identifier(Identifier { namespace: None, literal: "x" }),
553+
Token::Identifier(Identifier { literal: "x" }),
552554
Token::ClosedParent,
553555
Token::OpenBrace,
554556
Token::Case,
@@ -573,7 +575,7 @@ fn lexer_test_getline_redirection() {
573575
&[
574576
Token::Getline,
575577
Token::Getline,
576-
Token::Identifier(Identifier { namespace: None, literal: "x" }),
578+
Token::Identifier(Identifier { literal: "x" }),
577579
Token::LesserThan,
578580
Token::String(b"f".into()),
579581
Token::String(b"cmd".into()),
@@ -623,10 +625,7 @@ fn lexer_test_func_keyword_posix() {
623625
let arena = Bump::new();
624626
assert_eq!(
625627
&lex(b"func", &arena, true, false),
626-
&[Token::Identifier(Identifier {
627-
namespace: None,
628-
literal: "func"
629-
})]
628+
&[Token::Identifier(Identifier { literal: "func" })]
630629
);
631630
}
632631

parser/src/diagnostics.rs

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
// files that was distributed with this source code.
55

66
use ariadne::{Color, Label, Report, ReportKind, Source};
7+
use either::Either;
78
use lexer::{LexingError, Span};
89
use thiserror::Error;
910

@@ -59,8 +60,8 @@ pub enum ParsingError {
5960
FunctionCallSeparatedIdent(Span),
6061
#[error("Missing closing parenthesis `(` in function call to `{}`.", .1)]
6162
FunctionCallUnclosed(Span, String),
62-
#[error("Expected to be an identifier.")]
63-
ExpectedIdentifier(Span),
63+
#[error("Expected to be a valid identifier.")]
64+
ExpectedIdentifier(Span, Option<Either<Span, Span>>),
6465
#[error("Expected an unary operation.")]
6566
ExpectedUnaryOperator(Span),
6667
#[error("Expected a binary operation")]
@@ -113,7 +114,7 @@ impl ParsingError {
113114
Self::FunctionCallMissingParenthesis(span) => Some(span.clone()),
114115
Self::FunctionCallSeparatedIdent(span) => Some(span.clone()),
115116
Self::FunctionCallUnclosed(span, _) => Some(span.clone()),
116-
Self::ExpectedIdentifier(span) => Some(span.clone()),
117+
Self::ExpectedIdentifier(span, _) => Some(span.clone()),
117118
Self::ExpectedUnaryOperator(span) => Some(span.clone()),
118119
Self::ExpectedBinaryOperator(span) => Some(span.clone()),
119120
Self::ExpectedPlaceOperator(span) => Some(span.clone()),
@@ -155,27 +156,55 @@ impl ParsingError {
155156
"This is only valid in some contexts, like a right-hand assignment or a function argument.",
156157
),
157158
Self::NonAssociativeOperator(_) => Some(
158-
"Some operators can't be chained to avoid logical errors, such as comparison ones.\nExample: write `a == b && b == c` instead of `a == b == c`.",
159+
"Some operators can't be chained to avoid logical errors, such as comparison ones.\n\
160+
Example: write `a == b && b == c` instead of `a == b == c`.",
161+
),
162+
Self::ExpectedIdentifier(_, _) => Some(
163+
"Valid identifiers are sequences of ASCII letters, numbers and underscores, not \
164+
starting with a number.\nAdditionally, these must not match keywords (`if`, \
165+
`while`, etc.) and built-in functions.\n\nNote: qualified identifiers, like \
166+
`foo::bar`, must not have spaces around the `::`.",
159167
),
160168
_ => None,
161169
}
162170
}
171+
fn secondary(&self) -> Option<(&'static str, Span, i32)> {
172+
match self {
173+
Self::ExpectedIdentifier(_, Some(span)) => Some((
174+
"Unexpected space.",
175+
span.clone().into_inner(),
176+
2 * span.is_left() as i32,
177+
)),
178+
_ => None,
179+
}
180+
}
163181
}
164182

165183
pub fn report_error<'a>(
166184
error: ParsingError,
167185
name: &'a str,
168186
source: &'a [u8],
169187
) -> super::AriadneErr<'a> {
188+
// TODO: invert the interface, so error types set the diagnostic labels.
189+
// TODO: use a shared ariadne instance so we can also emit warnings.
170190
let span = error.span().unwrap_or(source.len()..source.len());
171191
let source = str::from_utf8(source).unwrap();
172192
let mut report = Report::build(ReportKind::Error, (name, span.clone()))
173193
.with_message("Syntax error")
174194
.with_label(
175195
Label::new((name, span.clone()))
176196
.with_message(format!("{error}"))
177-
.with_color(Color::Red),
197+
.with_color(Color::Red)
198+
.with_order(1),
199+
);
200+
if let Some((str, span, order)) = error.secondary() {
201+
report.add_label(
202+
Label::new((name, span))
203+
.with_message(str)
204+
.with_color(Color::Yellow)
205+
.with_order(order),
178206
);
207+
}
179208
if let Some(str) = error.hint() {
180209
report.set_help(str);
181210
}

parser/src/lex.rs

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use crate::{
1717
pub struct Lexer<'a> {
1818
inner: Peekable<SpannedIter<'a, Token<'a>>>,
1919
span: Span,
20-
// source: &'a [u8],
20+
source: &'a [u8],
2121
}
2222

2323
type LexItem<'a> = <Lexer<'a> as Iterator>::Item;
@@ -28,7 +28,7 @@ impl<'a> Lexer<'a> {
2828
// TODO: wire in POSIX & GNU strict conformance.
2929
inner: Token::lex(source, arena, false, false).spanned().peekable(),
3030
span: Span::default(),
31-
// source,
31+
source,
3232
}
3333
}
3434

@@ -77,9 +77,9 @@ impl<'a> Lexer<'a> {
7777
{
7878
Ok(ident)
7979
} else {
80-
Err(ParsingError::UnexpectedToken(
80+
Err(ParsingError::ExpectedIdentifier(
8181
self.peeked_span().unwrap_or(self.span()),
82-
"expected an identifier.".into(),
82+
None,
8383
))
8484
}
8585
}
@@ -182,6 +182,13 @@ impl<'a> Lexer<'a> {
182182
pub fn is_yuxtaposed(&mut self) -> bool {
183183
self.peeked_span().is_ok_and(|x| x.start == self.span.end)
184184
}
185+
186+
/// # Safety
187+
///
188+
/// The source bytes covered by the current token's span must be valid UTF-8.
189+
pub unsafe fn src_as_str(&self) -> &'a str {
190+
unsafe { str::from_utf8_unchecked(&self.source[self.span()]) }
191+
}
185192
}
186193

187194
impl<'a> Iterator for Lexer<'a> {
@@ -197,6 +204,7 @@ pub trait TokenExt {
197204
fn is_prefix_op(&self) -> bool;
198205
fn is_atom(&self) -> bool;
199206
fn is_expr_start(&self) -> bool;
207+
fn is_ident_place(&self) -> bool;
200208
fn is_place(&self) -> bool;
201209
fn is_pattern_start(&self) -> bool;
202210
fn maps_to_command(&self) -> Option<Command>;
@@ -233,10 +241,11 @@ impl TokenExt for Token<'_> {
233241
Token::IndirectCall(_) | Token::Getline | Token::OpenParent
234242
)
235243
}
236-
fn is_place(&self) -> bool {
244+
fn is_ident_place(&self) -> bool {
237245
matches!(
238246
self,
239-
Token::NrVariable
247+
Token::Identifier(_)
248+
| Token::NrVariable
240249
| Token::NfVariable
241250
| Token::FsVariable
242251
| Token::RsVariable
@@ -251,10 +260,11 @@ impl TokenExt for Token<'_> {
251260
| Token::RstartVariable
252261
| Token::RlengthVariable
253262
| Token::EnvironVariable
254-
| Token::Identifier(_)
255-
| Token::Record
256263
)
257264
}
265+
fn is_place(&self) -> bool {
266+
matches!(self, Token::Record) || self.is_ident_place()
267+
}
258268
fn is_pattern_start(&self) -> bool {
259269
self.is_expr_start() || self.maps_to_special_pat().is_some()
260270
}

0 commit comments

Comments
 (0)