Skip to content

Commit dfdc525

Browse files
committed
Unify lifetime and identifier parsing
1 parent c461182 commit dfdc525

8 files changed

Lines changed: 119 additions & 54 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4450,6 +4450,7 @@ dependencies = [
44504450
"thin-vec",
44514451
"tracing",
44524452
"unicode-normalization",
4453+
"unicode-properties",
44534454
"unicode-width 0.2.2",
44544455
]
44554456

compiler/rustc_lexer/src/lib.rs

Lines changed: 19 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,7 @@ pub enum TokenKind {
140140

141141
/// A lifetime, e.g. `'a`.
142142
Lifetime {
143-
starts_with_number: bool,
144-
has_emoji: bool,
143+
invalid: bool,
145144
},
146145

147146
/// `;`
@@ -585,7 +584,7 @@ impl<'a> Cursor<'a> {
585584
let kind = RawStr { n_hashes: res.ok() };
586585
Literal { kind, suffix_start }
587586
}
588-
_ => self.ident_or_unknown_prefix(),
587+
_ => self.ident_or_unknown_prefix(false),
589588
},
590589

591590
// Byte literal, byte string literal, raw byte string literal or identifier.
@@ -604,7 +603,7 @@ impl<'a> Cursor<'a> {
604603

605604
// Identifier (this should be checked after other variant that can
606605
// start as identifier).
607-
c if is_id_start(c) => self.ident_or_unknown_prefix(),
606+
c if is_id_start(c) => self.ident_or_unknown_prefix(false),
608607

609608
// Numeric literal.
610609
c @ '0'..='9' => {
@@ -662,7 +661,7 @@ impl<'a> Cursor<'a> {
662661
Literal { kind, suffix_start }
663662
}
664663
// Identifier starting with an emoji. Only lexed for graceful error recovery.
665-
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
664+
c if is_emoji(c) => self.invalid_ident(),
666665
_ => Unknown,
667666
};
668667
if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
@@ -833,25 +832,22 @@ impl<'a> Cursor<'a> {
833832
RawIdent
834833
}
835834

836-
fn ident_or_unknown_prefix(&mut self) -> TokenKind {
837-
debug_assert!(is_id_start(self.prev()));
835+
fn ident_or_unknown_prefix(&mut self, already_invalid: bool) -> TokenKind {
836+
debug_assert!(is_id_start(self.prev()) || already_invalid);
838837
// Start is already eaten, eat the rest of identifier.
839838
self.eat_while(is_id_continue);
840839
// Known prefixes must have been handled earlier. So if
841840
// we see a prefix here, it is definitely an unknown prefix.
842841
match self.first() {
843842
'#' | '"' | '\'' => UnknownPrefix,
844-
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
843+
c if is_emoji(c) => self.invalid_ident(),
845844
_ => Ident,
846845
}
847846
}
848847

849848
fn invalid_ident(&mut self) -> TokenKind {
850849
// Start is already eaten, eat the rest of identifier.
851-
self.eat_while(|c| {
852-
const ZERO_WIDTH_JOINER: char = '\u{200d}';
853-
is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
854-
});
850+
self.eat_while(|c| is_id_continue(c) || is_emoji(c));
855851
// An invalid identifier followed by '#' or '"' or '\'' could be
856852
// interpreted as an invalid literal prefix. We don't bother doing that
857853
// because the treatment of invalid identifiers and invalid prefixes
@@ -896,7 +892,7 @@ impl<'a> Cursor<'a> {
896892
let kind = mk_kind_raw(res.ok());
897893
Literal { kind, suffix_start }
898894
}
899-
_ => self.ident_or_unknown_prefix(),
895+
_ => self.ident_or_unknown_prefix(false),
900896
}
901897
}
902898

@@ -976,7 +972,7 @@ impl<'a> Cursor<'a> {
976972
fn lifetime_or_char(&mut self) -> TokenKind {
977973
debug_assert!(self.prev() == '\'');
978974

979-
let mut has_emoji = false;
975+
let mut invalid = false;
980976
let can_be_a_lifetime = if self.second() == '\'' {
981977
// It's surely not a lifetime.
982978
false
@@ -985,11 +981,9 @@ impl<'a> Cursor<'a> {
985981
// Also check if it's a number for a better error reporting (so '0 will
986982
// be reported as invalid lifetime and not as unterminated char literal).
987983
let c = self.first();
988-
let is_emoji = !c.is_ascii() && c.is_emoji_char();
989-
if is_emoji {
990-
has_emoji = true;
991-
}
992-
is_id_start(c) || c.is_ascii_digit() || is_emoji
984+
invalid |= c.is_ascii_digit();
985+
invalid |= is_emoji(c);
986+
is_id_start(c) || invalid
993987
};
994988

995989
if !can_be_a_lifetime {
@@ -1019,13 +1013,7 @@ impl<'a> Cursor<'a> {
10191013
// First symbol can be a number (which isn't a valid identifier start),
10201014
// so skip it without any checks.
10211015
self.bump();
1022-
self.eat_while(|c| {
1023-
let is_emoji = !c.is_ascii() && c.is_emoji_char();
1024-
if is_emoji {
1025-
has_emoji = true;
1026-
}
1027-
is_id_continue(c) || is_emoji
1028-
});
1016+
invalid |= matches!(self.ident_or_unknown_prefix(invalid), InvalidIdent);
10291017

10301018
match self.first() {
10311019
// Check if after skipping literal contents we've met a closing
@@ -1037,7 +1025,7 @@ impl<'a> Cursor<'a> {
10371025
Literal { kind, suffix_start: self.pos_within_token() }
10381026
}
10391027
'#' if !starts_with_number => UnknownPrefixLifetime,
1040-
_ => Lifetime { starts_with_number, has_emoji },
1028+
_ => Lifetime { invalid },
10411029
}
10421030
}
10431031

@@ -1290,3 +1278,7 @@ impl<'a> Cursor<'a> {
12901278
self.eat_while(is_id_continue);
12911279
}
12921280
}
1281+
1282+
fn is_emoji(c: char) -> bool {
1283+
!c.is_ascii() && c.is_emoji_char()
1284+
}

compiler/rustc_parse/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ rustc_span = { path = "../rustc_span" }
2020
thin-vec = "0.2.12"
2121
tracing = "0.1"
2222
unicode-normalization = "0.1.25"
23+
unicode-properties = { version = "0.1.4", default-features = false, features = ["emoji"] }
2324
unicode-width = "0.2.2"
2425
# tidy-alphabetical-end
2526

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use rustc_session::lint::builtin::{
1717
use rustc_session::parse::ParseSess;
1818
use rustc_span::{BytePos, Pos, Span, Symbol, sym};
1919
use tracing::debug;
20+
use unicode_properties::emoji::UnicodeEmoji;
2021

2122
use crate::errors;
2223
use crate::lexer::diagnostics::TokenTreeDiagInfo;
@@ -316,21 +317,62 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
316317
self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
317318
token::Literal(token::Lit { kind, symbol, suffix })
318319
}
319-
rustc_lexer::TokenKind::Lifetime { starts_with_number, has_emoji } => {
320+
rustc_lexer::TokenKind::Lifetime { invalid } => {
320321
// Include the leading `'` in the real identifier, for macro
321322
// expansion purposes. See #12512 for the gory details of why
322323
// this is necessary.
323324
let lifetime_name = nfc_normalize(self.str_from(start));
324325
self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
325326
let span = self.mk_sp(start, self.pos);
326-
if starts_with_number {
327-
self.dcx()
328-
.struct_err("lifetimes cannot start with a number")
329-
.with_span(span)
330-
.stash(span, StashKey::LifetimeIsChar);
331-
}
332-
if has_emoji {
333-
self.dcx().struct_span_err(span, "lifetimes cannot contain emoji").emit();
327+
if invalid {
328+
let name = lifetime_name.as_str();
329+
// skip(1) to skip the `'`
330+
let starts_with_number = matches!(
331+
name.chars().skip(1).next(),
332+
Some(c) if c.is_ascii_digit()
333+
);
334+
let mut emoji = vec![];
335+
for (i, c) in name.char_indices().skip(1) {
336+
let i = i as u32;
337+
if !c.is_ascii() && c.is_emoji_char() {
338+
let lo = start + BytePos(i);
339+
emoji.push(self.mk_sp(lo, lo + Pos::from_usize(c.len_utf8())));
340+
}
341+
}
342+
let err = match (starts_with_number, &emoji[..]) {
343+
(false, []) => {
344+
unreachable!("lifetime {name:?} incorrectly marked as invalid?");
345+
}
346+
(true, []) if name.len() > 2 => {
347+
// Point at the first lifetime name character.
348+
let start_span = self.mk_sp(start + BytePos(1), start + BytePos(2));
349+
self.dcx()
350+
.struct_err(format!(
351+
"lifetimes cannot start with a number: `{name}`"
352+
))
353+
.with_span(start_span)
354+
.with_span_label(span, "")
355+
}
356+
(true, []) => {
357+
// Point at the whole lifetime name.
358+
self.dcx()
359+
.struct_err(format!(
360+
"lifetimes cannot start with a number: `{name}`"
361+
))
362+
.with_span(span)
363+
}
364+
(false, [_, ..]) => self.dcx()
365+
.struct_err(format!("lifetimes cannot have emoji: `{name}`"))
366+
.with_span(emoji.clone())
367+
.with_span_label(span, ""),
368+
(true, [_, ..]) => self.dcx()
369+
.struct_err(format!(
370+
"invalid lifetime name: `{}`",
371+
name.escape_default(),
372+
))
373+
.with_span(span),
374+
};
375+
err.stash(span, StashKey::LifetimeIsChar);
334376
}
335377
token::Lifetime(lifetime_name, IdentIsRaw::No)
336378
}
Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,20 @@
11
// #141081
2-
fn bad_lifetime_name<'🐛🐛🐛family👨‍👩‍👧‍👦>(_: &'🐛🐛🐛family👨‍👩‍👧‍👦 ()) {}
3-
//~^ ERROR: lifetimes cannot contain emoji
4-
//~| ERROR: lifetimes cannot contain emoji
2+
fn bad_lifetime_name<
3+
'🐛🐛🐛family👨‍👩‍👧‍👦,//~ ERROR: lifetimes cannot have emoji
4+
'12, //~ ERROR: lifetimes cannot start with a number
5+
'a🐛, //~ ERROR: lifetimes cannot have emoji
6+
'1🐛, //~ ERROR: invalid lifetime name
7+
'1, //~ ERROR: lifetimes cannot start with a number
8+
'a‌b // bare zero-width-joiners are accepted as XID_Continue
9+
>() {}
10+
11+
12+
13+
14+
15+
516
fn main() {
6-
'🐛: { //~ ERROR: lifetimes cannot contain emoji
17+
'🐛: { //~ ERROR: lifetimes cannot have emoji
718
todo!();
819
};
920
}
Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,38 @@
1-
error: lifetimes cannot contain emoji
2-
--> $DIR/emoji-in-lifetime.rs:2:22
1+
error: lifetimes cannot have emoji: `'🐛🐛🐛family👨👩👧👦`
2+
--> $DIR/emoji-in-lifetime.rs:3:6
33
|
4-
LL | fn bad_lifetime_name<'🐛🐛🐛family👨👩👧👦>(_: &'🐛🐛🐛family👨👩👧👦 ()) {}
5-
| ^^^^^^^^^^^^^^^^^^^^^
4+
LL | '🐛🐛🐛family👨👩👧👦,
5+
| -^^^^^^------^^^^^^^^
66

7-
error: lifetimes cannot contain emoji
8-
--> $DIR/emoji-in-lifetime.rs:2:45
7+
error: lifetimes cannot start with a number: `'12`
8+
--> $DIR/emoji-in-lifetime.rs:4:6
99
|
10-
LL | fn bad_lifetime_name<'🐛🐛🐛family👨👩👧👦>(_: &'🐛🐛🐛family👨👩👧👦 ()) {}
11-
| ^^^^^^^^^^^^^^^^^^^^^
10+
LL | '12,
11+
| -^-
1212

13-
error: lifetimes cannot contain emoji
13+
error: lifetimes cannot have emoji: `'a🐛`
14+
--> $DIR/emoji-in-lifetime.rs:5:7
15+
|
16+
LL | 'a🐛,
17+
| --^^
18+
19+
error: invalid lifetime name: `\'1\u{1f41b}`
1420
--> $DIR/emoji-in-lifetime.rs:6:5
1521
|
22+
LL | '1🐛,
23+
| ^^^^
24+
25+
error: lifetimes cannot start with a number: `'1`
26+
--> $DIR/emoji-in-lifetime.rs:7:5
27+
|
28+
LL | '1,
29+
| ^^
30+
31+
error: lifetimes cannot have emoji: `'🐛`
32+
--> $DIR/emoji-in-lifetime.rs:17:6
33+
|
1634
LL | '🐛: {
17-
| ^^^
35+
| -^^
1836

19-
error: aborting due to 3 previous errors
37+
error: aborting due to 6 previous errors
2038

tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ LL - println!('1 + 1');
1010
LL + println!("1 + 1");
1111
|
1212

13-
error: lifetimes cannot start with a number
13+
error: lifetimes cannot start with a number: `'1`
1414
--> $DIR/lex-bad-str-literal-as-char-1.rs:3:14
1515
|
1616
LL | println!('1 + 1');

tests/ui/parser/numeric-lifetime.stderr

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ LL | let x: usize = "";
66
| |
77
| expected due to this
88

9-
error: lifetimes cannot start with a number
9+
error: lifetimes cannot start with a number: `'1`
1010
--> $DIR/numeric-lifetime.rs:1:10
1111
|
1212
LL | struct S<'1> { s: &'1 usize }
1313
| ^^
1414

15-
error: lifetimes cannot start with a number
15+
error: lifetimes cannot start with a number: `'1`
1616
--> $DIR/numeric-lifetime.rs:1:20
1717
|
1818
LL | struct S<'1> { s: &'1 usize }

0 commit comments

Comments
 (0)