Unify lifetime and identifier parsing

estebank · estebank · commit dfdc525ff244 · 2026-03-18T18:11:05.000Z
diff --git a/Cargo.lock b/Cargo.lock
@@ -4450,6 +4450,7 @@ dependencies = [
  "thin-vec",
  "tracing",
  "unicode-normalization",
+ "unicode-properties",
  "unicode-width 0.2.2",
 ]
 
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
@@ -140,8 +140,7 @@ pub enum TokenKind {
 
     /// A lifetime, e.g. `'a`.
     Lifetime {
-        starts_with_number: bool,
-        has_emoji: bool,
+        invalid: bool,
     },
 
     /// `;`
@@ -585,7 +584,7 @@ impl<'a> Cursor<'a> {
                     let kind = RawStr { n_hashes: res.ok() };
                     Literal { kind, suffix_start }
                 }
-                _ => self.ident_or_unknown_prefix(),
+                _ => self.ident_or_unknown_prefix(false),
             },
 
             // Byte literal, byte string literal, raw byte string literal or identifier.
@@ -604,7 +603,7 @@ impl<'a> Cursor<'a> {
 
             // Identifier (this should be checked after other variant that can
             // start as identifier).
-            c if is_id_start(c) => self.ident_or_unknown_prefix(),
+            c if is_id_start(c) => self.ident_or_unknown_prefix(false),
 
             // Numeric literal.
             c @ '0'..='9' => {
@@ -662,7 +661,7 @@ impl<'a> Cursor<'a> {
                 Literal { kind, suffix_start }
             }
             // Identifier starting with an emoji. Only lexed for graceful error recovery.
-            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
+            c if is_emoji(c) => self.invalid_ident(),
             _ => Unknown,
         };
         if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
@@ -833,25 +832,22 @@ impl<'a> Cursor<'a> {
         RawIdent
     }
 
-    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
-        debug_assert!(is_id_start(self.prev()));
+    fn ident_or_unknown_prefix(&mut self, already_invalid: bool) -> TokenKind {
+        debug_assert!(is_id_start(self.prev()) || already_invalid);
         // Start is already eaten, eat the rest of identifier.
         self.eat_while(is_id_continue);
         // Known prefixes must have been handled earlier. So if
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
             '#' | '"' | '\'' => UnknownPrefix,
-            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
+            c if is_emoji(c) => self.invalid_ident(),
             _ => Ident,
         }
     }
 
     fn invalid_ident(&mut self) -> TokenKind {
         // Start is already eaten, eat the rest of identifier.
-        self.eat_while(|c| {
-            const ZERO_WIDTH_JOINER: char = '\u{200d}';
-            is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
-        });
+        self.eat_while(|c| is_id_continue(c) || is_emoji(c));
         // An invalid identifier followed by '#' or '"' or '\'' could be
         // interpreted as an invalid literal prefix. We don't bother doing that
         // because the treatment of invalid identifiers and invalid prefixes
@@ -896,7 +892,7 @@ impl<'a> Cursor<'a> {
                 let kind = mk_kind_raw(res.ok());
                 Literal { kind, suffix_start }
             }
-            _ => self.ident_or_unknown_prefix(),
+            _ => self.ident_or_unknown_prefix(false),
         }
     }
 
@@ -976,7 +972,7 @@ impl<'a> Cursor<'a> {
     fn lifetime_or_char(&mut self) -> TokenKind {
         debug_assert!(self.prev() == '\'');
 
-        let mut has_emoji = false;
+        let mut invalid = false;
         let can_be_a_lifetime = if self.second() == '\'' {
             // It's surely not a lifetime.
             false
@@ -985,11 +981,9 @@ impl<'a> Cursor<'a> {
             // Also check if it's a number for a better error reporting (so '0 will
             // be reported as invalid lifetime and not as unterminated char literal).
             let c = self.first();
-            let is_emoji = !c.is_ascii() && c.is_emoji_char();
-            if is_emoji {
-                has_emoji = true;
-            }
-            is_id_start(c) || c.is_ascii_digit() || is_emoji
+            invalid |= c.is_ascii_digit();
+            invalid |= is_emoji(c);
+            is_id_start(c) || invalid
         };
 
         if !can_be_a_lifetime {
@@ -1019,13 +1013,7 @@ impl<'a> Cursor<'a> {
         // First symbol can be a number (which isn't a valid identifier start),
         // so skip it without any checks.
         self.bump();
-        self.eat_while(|c| {
-            let is_emoji = !c.is_ascii() && c.is_emoji_char();
-            if is_emoji {
-                has_emoji = true;
-            }
-            is_id_continue(c) || is_emoji
-        });
+        invalid |= matches!(self.ident_or_unknown_prefix(invalid), InvalidIdent);
 
         match self.first() {
             // Check if after skipping literal contents we've met a closing
@@ -1037,7 +1025,7 @@ impl<'a> Cursor<'a> {
                 Literal { kind, suffix_start: self.pos_within_token() }
             }
             '#' if !starts_with_number => UnknownPrefixLifetime,
-            _ => Lifetime { starts_with_number, has_emoji },
+            _ => Lifetime { invalid },
         }
     }
 
@@ -1290,3 +1278,7 @@ impl<'a> Cursor<'a> {
         self.eat_while(is_id_continue);
     }
 }
+
+fn is_emoji(c: char) -> bool {
+    !c.is_ascii() && c.is_emoji_char()
+}
diff --git a/compiler/rustc_parse/Cargo.toml b/compiler/rustc_parse/Cargo.toml
@@ -20,6 +20,7 @@ rustc_span = { path = "../rustc_span" }
 thin-vec = "0.2.12"
 tracing = "0.1"
 unicode-normalization = "0.1.25"
+unicode-properties = { version = "0.1.4", default-features = false, features = ["emoji"] }
 unicode-width = "0.2.2"
 # tidy-alphabetical-end
 
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
@@ -17,6 +17,7 @@ use rustc_session::lint::builtin::{
 use rustc_session::parse::ParseSess;
 use rustc_span::{BytePos, Pos, Span, Symbol, sym};
 use tracing::debug;
+use unicode_properties::emoji::UnicodeEmoji;
 
 use crate::errors;
 use crate::lexer::diagnostics::TokenTreeDiagInfo;
@@ -316,21 +317,62 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                     self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
                     token::Literal(token::Lit { kind, symbol, suffix })
                 }
-                rustc_lexer::TokenKind::Lifetime { starts_with_number, has_emoji } => {
+                rustc_lexer::TokenKind::Lifetime { invalid } => {
                     // Include the leading `'` in the real identifier, for macro
                     // expansion purposes. See #12512 for the gory details of why
                     // this is necessary.
                     let lifetime_name = nfc_normalize(self.str_from(start));
                     self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                     let span = self.mk_sp(start, self.pos);
-                    if starts_with_number {
-                        self.dcx()
-                            .struct_err("lifetimes cannot start with a number")
-                            .with_span(span)
-                            .stash(span, StashKey::LifetimeIsChar);
-                    }
-                    if has_emoji {
-                        self.dcx().struct_span_err(span, "lifetimes cannot contain emoji").emit();
+                    if invalid {
+                        let name = lifetime_name.as_str();
+                        // skip(1) to skip the `'`
+                        let starts_with_number = matches!(
+                            name.chars().skip(1).next(),
+                            Some(c) if c.is_ascii_digit()
+                        );
+                        let mut emoji = vec![];
+                        for (i, c) in name.char_indices().skip(1) {
+                            let i = i as u32;
+                            if !c.is_ascii() && c.is_emoji_char() {
+                                let lo = start + BytePos(i);
+                                emoji.push(self.mk_sp(lo, lo + Pos::from_usize(c.len_utf8())));
+                            }
+                        }
+                        let err = match (starts_with_number, &emoji[..]) {
+                            (false, []) => {
+                                unreachable!("lifetime {name:?} incorrectly marked as invalid?");
+                            }
+                            (true, []) if name.len() > 2 => {
+                                // Point at the first lifetime name character.
+                                let start_span = self.mk_sp(start + BytePos(1), start + BytePos(2));
+                                self.dcx()
+                                    .struct_err(format!(
+                                        "lifetimes cannot start with a number: `{name}`"
+                                    ))
+                                    .with_span(start_span)
+                                    .with_span_label(span, "")
+                            }
+                            (true, []) => {
+                                // Point at the whole lifetime name.
+                                self.dcx()
+                                    .struct_err(format!(
+                                        "lifetimes cannot start with a number: `{name}`"
+                                    ))
+                                    .with_span(span)
+                            }
+                            (false, [_, ..]) => self.dcx()
+                                .struct_err(format!("lifetimes cannot have emoji: `{name}`"))
+                                .with_span(emoji.clone())
+                                .with_span_label(span, ""),
+                            (true, [_, ..]) => self.dcx()
+                                .struct_err(format!(
+                                    "invalid lifetime name: `{}`",
+                                    name.escape_default(),
+                                ))
+                                .with_span(span),
+                        };
+                        err.stash(span, StashKey::LifetimeIsChar);
                     }
                     token::Lifetime(lifetime_name, IdentIsRaw::No)
                 }
diff --git a/tests/ui/lexer/emoji-in-lifetime.rs b/tests/ui/lexer/emoji-in-lifetime.rs
@@ -1,9 +1,20 @@
 // #141081
-fn bad_lifetime_name<'🐛🐛🐛family👨‍👩‍👧‍👦>(_: &'🐛🐛🐛family👨‍👩‍👧‍👦 ()) {}
-//~^ ERROR: lifetimes cannot contain emoji
-//~| ERROR: lifetimes cannot contain emoji
+fn bad_lifetime_name<
+    '🐛🐛🐛family👨‍👩‍👧‍👦,//~ ERROR: lifetimes cannot have emoji
+    '12, //~ ERROR: lifetimes cannot start with a number
+    'a🐛, //~ ERROR: lifetimes cannot have emoji
+    '1🐛, //~ ERROR: invalid lifetime name
+    '1, //~ ERROR: lifetimes cannot start with a number
+    'a‌b // bare zero-width-joiners are accepted as XID_Continue
+>() {}
+
+
+
+
+
+
 fn main() {
-    '🐛: { //~ ERROR: lifetimes cannot contain emoji
+    '🐛: { //~ ERROR: lifetimes cannot have emoji
         todo!();
     };
 }
diff --git a/tests/ui/lexer/emoji-in-lifetime.stderr b/tests/ui/lexer/emoji-in-lifetime.stderr
@@ -1,20 +1,38 @@
-error: lifetimes cannot contain emoji
-  --> $DIR/emoji-in-lifetime.rs:2:22
+error: lifetimes cannot have emoji: `'🐛🐛🐛family👨👩👧👦`
+  --> $DIR/emoji-in-lifetime.rs:3:6
    |
-LL | fn bad_lifetime_name<'🐛🐛🐛family👨👩👧👦>(_: &'🐛🐛🐛family👨👩👧👦 ()) {}
-   |                      ^^^^^^^^^^^^^^^^^^^^^
+LL |     '🐛🐛🐛family👨👩👧👦,
+   |     -^^^^^^------^^^^^^^^
 
-error: lifetimes cannot contain emoji
-  --> $DIR/emoji-in-lifetime.rs:2:45
+error: lifetimes cannot start with a number: `'12`
+  --> $DIR/emoji-in-lifetime.rs:4:6
    |
-LL | fn bad_lifetime_name<'🐛🐛🐛family👨👩👧👦>(_: &'🐛🐛🐛family👨👩👧👦 ()) {}
-   |                                                 ^^^^^^^^^^^^^^^^^^^^^
+LL |     '12,
+   |     -^-
 
-error: lifetimes cannot contain emoji
+error: lifetimes cannot have emoji: `'a🐛`
+  --> $DIR/emoji-in-lifetime.rs:5:7
+   |
+LL |     'a🐛,
+   |     --^^
+
+error: invalid lifetime name: `\'1\u{1f41b}`
   --> $DIR/emoji-in-lifetime.rs:6:5
    |
+LL |     '1🐛,
+   |     ^^^^
+
+error: lifetimes cannot start with a number: `'1`
+  --> $DIR/emoji-in-lifetime.rs:7:5
+   |
+LL |     '1,
+   |     ^^
+
+error: lifetimes cannot have emoji: `'🐛`
+  --> $DIR/emoji-in-lifetime.rs:17:6
+   |
 LL |     '🐛: {
-   |     ^^^
+   |     -^^
 
-error: aborting due to 3 previous errors
+error: aborting due to 6 previous errors
 
diff --git a/tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr b/tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr
@@ -10,7 +10,7 @@ LL -     println!('1 + 1');
 LL +     println!("1 + 1");
    |
 
-error: lifetimes cannot start with a number
+error: lifetimes cannot start with a number: `'1`
   --> $DIR/lex-bad-str-literal-as-char-1.rs:3:14
    |
 LL |     println!('1 + 1');
diff --git a/tests/ui/parser/numeric-lifetime.stderr b/tests/ui/parser/numeric-lifetime.stderr
@@ -6,13 +6,13 @@ LL |     let x: usize = "";
    |            |
    |            expected due to this
 
-error: lifetimes cannot start with a number
+error: lifetimes cannot start with a number: `'1`
   --> $DIR/numeric-lifetime.rs:1:10
    |
 LL | struct S<'1> { s: &'1 usize }
    |          ^^
 
-error: lifetimes cannot start with a number
+error: lifetimes cannot start with a number: `'1`
   --> $DIR/numeric-lifetime.rs:1:20
    |
 LL | struct S<'1> { s: &'1 usize }

Original file line number	Diff line number	Diff line change
`@@ -4450,6 +4450,7 @@ dependencies = [`
`4450`	`4450`	`"thin-vec",`
`4451`	`4451`	`"tracing",`
`4452`	`4452`	`"unicode-normalization",`
	`4453`	`+ "unicode-properties",`
`4453`	`4454`	`"unicode-width 0.2.2",`
`4454`	`4455`	`]`
`4455`	`4456`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ LL - println!('1 + 1');`
`10`	`10`	`LL + println!("1 + 1");`
`11`	`11`	`\|`
`12`	`12`
`13`		`-error: lifetimes cannot start with a number`
	`13`	`` +error: lifetimes cannot start with a number: `'1` ``
`14`	`14`	`--> $DIR/lex-bad-str-literal-as-char-1.rs:3:14`
`15`	`15`	`\|`
`16`	`16`	`LL \| println!('1 + 1');`
Original file line number	Diff line number	Diff line change
`@@ -6,13 +6,13 @@ LL \| let x: usize = "";`
`6`	`6`	`\| \|`
`7`	`7`	`\| expected due to this`
`8`	`8`
`9`		`-error: lifetimes cannot start with a number`
	`9`	`` +error: lifetimes cannot start with a number: `'1` ``
`10`	`10`	`--> $DIR/numeric-lifetime.rs:1:10`
`11`	`11`	`\|`
`12`	`12`	`LL \| struct S<'1> { s: &'1 usize }`
`13`	`13`	`\| ^^`
`14`	`14`
`15`		`-error: lifetimes cannot start with a number`
	`15`	`` +error: lifetimes cannot start with a number: `'1` ``
`16`	`16`	`--> $DIR/numeric-lifetime.rs:1:20`
`17`	`17`	`\|`
`18`	`18`	`LL \| struct S<'1> { s: &'1 usize }`