Skip to content

Commit 5c67961

Browse files
committed
Rewrite rustc_span::symbol::Interner to avoid double hashing
Involves resorting to raw `HashTable` and writing an ad-hoc `IndexMap`-like structure, as we cannot get access to raw hashes otherwise. My local cachegrind profile shows ~ -20_000_000 Ir
1 parent ddc1a64 commit 5c67961

2 files changed

Lines changed: 65 additions & 34 deletions

File tree

compiler/rustc_span/src/symbol.rs

Lines changed: 54 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
//! allows bidirectional lookup; i.e., given a value, one can easily find the
33
//! type, and vice versa.
44
5-
use std::hash::{Hash, Hasher};
5+
use std::hash::{BuildHasher, Hash, Hasher};
66
use std::{fmt, str};
77

88
use rustc_arena::DroplessArena;
9-
use rustc_data_structures::fx::{FxHashSet, FxIndexSet};
9+
use rustc_data_structures::fx::FxBuildHasher;
10+
use rustc_data_structures::hash_table::{Entry, HashTable};
1011
use rustc_data_structures::stable_hash::{StableCompare, StableHash, StableHashCtxt, StableHasher};
1112
use rustc_data_structures::sync::Lock;
1213
use rustc_macros::{Decodable, Encodable, StableHash, symbols};
@@ -2712,32 +2713,46 @@ pub(crate) struct Interner(Lock<InternerInner>);
27122713
// between `Interner`s.
27132714
struct InternerInner {
27142715
arena: DroplessArena,
2715-
byte_strs: FxIndexSet<&'static [u8]>,
2716+
byte_strs: HashTable<(&'static [u8], u32)>,
2717+
ids: Vec<&'static [u8]>,
27162718
}
27172719

27182720
impl Interner {
27192721
// These arguments are `&str`, but because of the sharing, we are
27202722
// effectively pre-interning all these strings for both `Symbol` and
27212723
// `ByteSymbol`.
27222724
fn prefill(init: &[&'static str], extra: &[&'static str]) -> Self {
2723-
let byte_strs = FxIndexSet::from_iter(
2724-
init.iter().copied().chain(extra.iter().copied()).map(|str| str.as_bytes()),
2725-
);
2725+
let values = init.iter().copied().chain(extra.iter().copied()).map(|str| str.as_bytes());
2726+
let (size_hint, _) = values.size_hint();
2727+
let mut conflicting_values: Vec<&[u8]> = Vec::new();
27262728

2727-
// The order in which duplicates are reported is irrelevant.
2728-
#[expect(rustc::potential_query_instability)]
2729-
if byte_strs.len() != init.len() + extra.len() {
2729+
let mut byte_strs: HashTable<(&'static [u8], u32)> = HashTable::with_capacity(size_hint);
2730+
let hasher = FxBuildHasher::default();
2731+
let hasher_func = |x: &(_, _)| hasher.hash_one(x.0);
2732+
2733+
let mut ids: Vec<&'static [u8]> = Vec::with_capacity(size_hint);
2734+
2735+
for v in values {
2736+
let entry = byte_strs.entry(hasher.hash_one(&v), |&x| x.0 == v, hasher_func);
2737+
match entry {
2738+
Entry::Occupied(v) => {
2739+
conflicting_values.push(v.get().0);
2740+
}
2741+
Entry::Vacant(view) => {
2742+
view.insert((v, ids.len() as u32));
2743+
ids.push(v);
2744+
}
2745+
}
2746+
}
2747+
2748+
if conflicting_values.len() != 0 {
27302749
panic!(
27312750
"duplicate symbols in the rustc symbol list and the extra symbols added by the driver: {:?}",
2732-
FxHashSet::intersection(
2733-
&init.iter().copied().collect(),
2734-
&extra.iter().copied().collect(),
2735-
)
2736-
.collect::<Vec<_>>()
2751+
conflicting_values
27372752
)
27382753
}
27392754

2740-
Interner(Lock::new(InternerInner { arena: Default::default(), byte_strs }))
2755+
Interner(Lock::new(InternerInner { arena: Default::default(), byte_strs, ids }))
27412756
}
27422757

27432758
fn intern_str(&self, str: &str) -> Symbol {
@@ -2750,24 +2765,29 @@ impl Interner {
27502765

27512766
#[inline]
27522767
fn intern_inner(&self, byte_str: &[u8]) -> u32 {
2753-
let mut inner = self.0.lock();
2754-
if let Some(idx) = inner.byte_strs.get_index_of(byte_str) {
2755-
return idx as u32;
2756-
}
2757-
2758-
let byte_str: &[u8] = inner.arena.alloc_slice(byte_str);
2759-
2760-
// SAFETY: we can extend the arena allocation to `'static` because we
2761-
// only access these while the arena is still alive.
2762-
let byte_str: &'static [u8] = unsafe { &*(byte_str as *const [u8]) };
2763-
2764-
// This second hash table lookup can be avoided by using `RawEntryMut`,
2765-
// but this code path isn't hot enough for it to be worth it. See
2766-
// #91445 for details.
2767-
let (idx, is_new) = inner.byte_strs.insert_full(byte_str);
2768-
debug_assert!(is_new); // due to the get_index_of check above
2769-
2770-
idx as u32
2768+
let hasher = FxBuildHasher::default();
2769+
let hasher_func = |x: &(_, _)| hasher.hash_one(x.0);
2770+
let hash_of_byte_str = hasher.hash_one(byte_str);
2771+
2772+
self.0.with_lock(|inner| {
2773+
let entry = inner.byte_strs.entry(hash_of_byte_str, |&x| x.0 == byte_str, hasher_func);
2774+
match entry {
2775+
Entry::Occupied(v) => {
2776+
return v.get().1;
2777+
}
2778+
Entry::Vacant(view) => {
2779+
let byte_str: &[u8] = inner.arena.alloc_slice(byte_str);
2780+
2781+
// SAFETY: we can extend the arena allocation to `'static` because we
2782+
// only access these while the arena is still alive.
2783+
let byte_str: &'static [u8] = unsafe { &*(byte_str as *const [u8]) };
2784+
let idx = inner.ids.len() as u32;
2785+
view.insert((byte_str, idx));
2786+
inner.ids.push(byte_str);
2787+
return idx;
2788+
}
2789+
};
2790+
})
27712791
}
27722792

27732793
/// Get the symbol as a string.
@@ -2787,7 +2807,7 @@ impl Interner {
27872807
}
27882808

27892809
fn get_inner(&self, index: usize) -> &[u8] {
2790-
self.0.lock().byte_strs.get_index(index).unwrap()
2810+
self.0.with_lock(|inner| inner.ids[index])
27912811
}
27922812
}
27932813

compiler/rustc_span/src/symbol/tests.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@ fn interner_tests() {
1515
assert_eq!(i.intern_str("dog"), Symbol::new(0));
1616
}
1717

18+
#[test]
19+
fn interner_get() {
20+
let i = Interner::prefill(&["chicken"], &["cow"]);
21+
let dog_idx = i.intern_str("dog"); // 2
22+
let cat_idx = i.intern_str("cat"); // 3
23+
assert_eq!(i.get_str(Symbol::new(0)), "chicken");
24+
assert_eq!(i.get_str(Symbol::new(1)), "cow");
25+
assert_eq!(i.get_str(cat_idx), "cat");
26+
assert_eq!(i.get_str(dog_idx), "dog");
27+
}
28+
1829
#[test]
1930
fn without_first_quote_test() {
2031
create_default_session_globals_then(|| {

0 commit comments

Comments
 (0)