SpacetimeDB/crates/schema/src/identifier.rs at 4fb542f332edb87e8d9c91b115d7b9ce0940bca5 · clockworklabs/SpacetimeDB · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
use crate::error::IdentifierError;
use spacetimedb_data_structures::map::{Equivalent, HashSet};
use spacetimedb_sats::raw_identifier::RawIdentifier;
use spacetimedb_sats::{impl_deserialize, impl_serialize, impl_st};
use std::fmt::{self, Debug, Display};
use std::ops::Deref;
use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_normalization::UnicodeNormalization;

lazy_static::lazy_static! {
    /// The set of reserved identifiers, one entry per line of `reserved_identifiers.txt`.
    ///
    /// The file is embedded at compile time via `include_str!`, so its path is resolved
    /// relative to this source file. The set itself is built lazily on first access.
    ///
    /// Lookups are performed with the upper-cased candidate (see `Identifier::is_reserved`),
    /// so entries are presumably stored in upper case -- TODO confirm against the data file.
    ///
    /// TODO(1.0): Pull in the rest of the reserved identifiers from the Identifier Proposal once that's merged.
    static ref RESERVED_IDENTIFIERS: HashSet<&'static str> = include_str!("reserved_identifiers.txt").lines().collect();
}

/// A valid SpacetimeDB Identifier.
///
/// Identifiers must be normalized according to [Unicode Standard Annex 15](https://www.unicode.org/reports/tr15/), normalization form C
/// (Canonical Decomposition followed by Canonical Composition).
/// Following Rust, we use the identifier rules defined by [Unicode Standard Annex 31](https://www.unicode.org/reports/tr31/tr31-37.html) to validate identifiers.
/// We allow underscores as well as any XID_Start character to start an identifier.
///
/// In addition, we forbid the use of any identifier reserved by [PostgreSQL](https://www.postgresql.org/docs/current/sql-keywords-appendix.html).
/// Any string that is converted into a reserved word by the Rust function
/// [`String::to_uppercase`](https://doc.rust-lang.org/std/string/struct.String.html#method.to_uppercase) will be rejected.
///
/// The list of reserved words is embedded at compile time from `reserved_identifiers.txt`,
/// which lives next to this source file (`SpacetimeDB/crates/schema/src/reserved_identifiers.txt`).
///
/// Internally, this is just a raw identifier with some validation on construction.
/// The validation is performed by [`Identifier::new`]; [`Identifier::new_assume_valid`]
/// (which is also what deserialization uses) wraps the raw identifier without checking it.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Identifier {
    /// The underlying identifier text.
    id: RawIdentifier,
}

// An `Identifier` has the same SATS type as the `RawIdentifier` it wraps.
impl_st!([] Identifier, ts => RawIdentifier::make_type(ts));
// Serialized as a plain string.
impl_serialize!([] Identifier, (self, ser) => ser.serialize_str(&self.id));
// Deserialized as a `RawIdentifier` and wrapped with `new_assume_valid`:
// NOTE(review): no validation is performed on deserialized identifiers -- confirm this is intended.
impl_deserialize!([] Identifier, de => RawIdentifier::deserialize(de).map(Self::new_assume_valid));

impl Identifier {
    /// Wraps `name` in an `Identifier` without validating it.
    ///
    /// The caller is responsible for making sure that `name` would have been
    /// accepted by [`Identifier::new`].
    pub fn new_assume_valid(name: RawIdentifier) -> Self {
        Self { id: name }
    }

    /// Validates that the input string is a valid identifier.
    ///
    /// Currently, this rejects non-canonicalized identifiers.
    /// Eventually, it will be changed to canonicalize the input string.
    ///
    /// # Errors
    ///
    /// - `IdentifierError::Empty` if `name` is empty.
    /// - `IdentifierError::NotCanonicalized` if `name` is not in Unicode Normalization Form C.
    /// - `IdentifierError::InvalidStart` if the first character is neither `_` nor `XID_Start`.
    /// - `IdentifierError::InvalidContinue` if any later character is not `XID_Continue`.
    /// - `IdentifierError::Reserved` if the upper-cased `name` is a reserved identifier.
    pub fn new(name: RawIdentifier) -> Result<Self, IdentifierError> {
        if name.is_empty() {
            return Err(IdentifierError::Empty {});
        }

        // Reject (rather than convert) anything that is not already in
        // Unicode Normalization Form C (canonical decomposition followed by composition).
        // `Iterator::eq` also compares the lengths of the two sequences, whereas `zip`
        // silently stops at the shorter one and would ignore a length-only difference.
        if !name.nfc().eq(name.chars()) {
            return Err(IdentifierError::NotCanonicalized { name });
        }

        let mut chars = name.chars();

        // `name` is known to be non-empty here; the error arm is kept as a safety net.
        let start = chars.next().ok_or(IdentifierError::Empty {})?;
        if !is_xid_start(start) && start != '_' {
            return Err(IdentifierError::InvalidStart {
                name,
                invalid_start: start,
            });
        }

        // Every character after the first must be `XID_Continue`; report the first offender.
        for char_ in chars {
            if !is_xid_continue(char_) {
                return Err(IdentifierError::InvalidContinue {
                    name,
                    invalid_continue: char_,
                });
            }
        }

        if Identifier::is_reserved(&name) {
            return Err(IdentifierError::Reserved { name });
        }

        Ok(Identifier { id: name })
    }

    /// Builds a validated identifier from `name`, for use in tests.
    ///
    /// # Panics
    ///
    /// Panics if `name` is rejected by [`Identifier::new`].
    pub fn for_test(name: impl AsRef<str>) -> Self {
        Identifier::new(RawIdentifier::new(name.as_ref())).unwrap()
    }

    /// Returns the raw identifier of this identifier.
    pub fn as_raw(&self) -> &RawIdentifier {
        &self.id
    }

    /// Check if a string is a reserved identifier.
    ///
    /// The check is case-insensitive in the sense that `name` is upper-cased with
    /// [`str::to_uppercase`] before being looked up in the reserved set.
    pub fn is_reserved(name: &str) -> bool {
        RESERVED_IDENTIFIERS.contains(&*name.to_uppercase())
    }
}

impl Debug for Identifier {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        Debug::fmt(&self.id, f)
    }
}

impl Display for Identifier {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        Display::fmt(&self.id, f)
    }
}

impl Deref for Identifier {
    type Target = str;

    /// Exposes the identifier's text as a plain string slice.
    fn deref(&self) -> &str {
        &self.id[..]
    }
}

impl Equivalent<Identifier> for str {
    /// A string slice is equivalent to an `Identifier` when their text is identical.
    fn equivalent(&self, other: &Identifier) -> bool {
        let other_text: &str = &other.id[..];
        self == other_text
    }
}

impl From<Identifier> for RawIdentifier {
    fn from(id: Identifier) -> Self {
        id.id
    }
}

/// An identifier that allows dot or slash separators between otherwise-normal identifier segments.
///
/// Used for fully-qualified names of mounted module items, e.g.:
/// - `"lib.library_table"` (table name)
/// - `"lib.library_table_id_idx_btree"` (index name)
/// - `"lib/library_reducer"` (reducer name)
///
/// Root-level items use their plain name with no separator (e.g., `"user"`).
///
/// Two construction paths exist:
/// - `NamespacedIdentifier::new` validates every `.`/`/`-separated segment with `Identifier::new`.
/// - `NamespacedIdentifier::new_assume_valid` (and the `From<&str>` / `From<String>` impls,
///   which delegate to it) store the string as-is without any validation.
///
/// Construction from known-valid components should use `new_assume_valid`.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct NamespacedIdentifier(Box<str>);

impl NamespacedIdentifier {
    /// Stores `s` as-is, performing no validation.
    ///
    /// Meant for strings assembled from segments that are already known to be valid
    /// `Identifier`s (e.g., `format!("{}{}", prefix, &*identifier)`).
    pub fn new_assume_valid(s: impl Into<Box<str>>) -> Self {
        let text: Box<str> = s.into();
        Self(text)
    }

    /// Validating constructor.
    ///
    /// The input is split on `.` and `/`, and every resulting segment must be accepted by
    /// `Identifier::new`. Empty segments (as produced by an empty input, or by leading,
    /// trailing or doubled separators) are therefore rejected as well.
    ///
    /// # Errors
    ///
    /// Returns the `IdentifierError` of the first segment that fails validation.
    pub fn new(s: impl Into<Box<str>>) -> Result<Self, IdentifierError> {
        let text: Box<str> = s.into();
        text.split(['.', '/'])
            .try_for_each(|segment| Identifier::new(RawIdentifier::new(segment)).map(drop))?;
        Ok(Self(text))
    }
}

impl std::fmt::Debug for NamespacedIdentifier {
    /// Writes the identifier as a quoted, escaped string, like `str`'s `Debug` output
    /// with default formatting options.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text: &str = &self.0;
        f.write_fmt(format_args!("{:?}", text))
    }
}

impl std::fmt::Display for NamespacedIdentifier {
    /// Writes the identifier's text verbatim to the formatter.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text: &str = &self.0;
        f.write_str(text)
    }
}

impl std::ops::Deref for NamespacedIdentifier {
    type Target = str;
    fn deref(&self) -> &str {
        &self.0
    }
}

impl AsRef<str> for NamespacedIdentifier {
    /// Borrows the full name as a string slice.
    fn as_ref(&self) -> &str {
        &*self.0
    }
}

impl From<NamespacedIdentifier> for Box<str> {
    fn from(id: NamespacedIdentifier) -> Self {
        id.0
    }
}

impl From<&str> for NamespacedIdentifier {
    fn from(s: &str) -> Self {
        Self::new_assume_valid(s)
    }
}

impl From<String> for NamespacedIdentifier {
    fn from(s: String) -> Self {
        Self::new_assume_valid(s)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    /// Shorthand for validating `s` as an `Identifier`.
    fn new(s: &str) -> Result<Identifier, IdentifierError> {
        Identifier::new(RawIdentifier::new(s))
    }

    #[test]
    fn test_a_bunch_of_identifiers() {
        let accepted = [
            "friends",
            "Oysters",
            "_hello",
            "bananas_there_",
            "Москва",
            "東京",
            "bees123",
        ];
        for name in accepted {
            // Name the offending input so a failure is immediately actionable.
            assert!(new(name).is_ok(), "expected {:?} to be accepted", name);
        }

        let rejected = [
            "",
            "123bees",
            "\u{200B}hello", // zero-width space
            " hello",
            "hello ",
            "🍌", // ;-; the unicode committee is no fun
        ];
        for name in rejected {
            assert!(new(name).is_err(), "expected {:?} to be rejected", name);
        }
    }

    #[test]
    fn test_canonicalization() {
        // Decomposed form (A + combining ring above) is not NFC and must be rejected.
        assert!(new("_\u{0041}\u{030A}").is_err());
        // canonicalized version of the above.
        assert!(new("_\u{00C5}").is_ok());
    }

    #[test]
    fn test_namespaced_identifiers() {
        let accepted = ["plain_name", "my_lib.library_table", "my_lib/library_reducer"];
        for name in accepted {
            assert!(
                NamespacedIdentifier::new(name).is_ok(),
                "expected {:?} to be accepted",
                name
            );
        }

        // Empty segments and segments that are not valid identifiers are rejected.
        let rejected = ["", ".leading", "trailing/", "my_lib..table_name", "my_lib.1table"];
        for name in rejected {
            assert!(
                NamespacedIdentifier::new(name).is_err(),
                "expected {:?} to be rejected",
                name
            );
        }
    }

    proptest! {
        #[test]
        fn test_standard_ascii_identifiers(s in "[a-zA-Z_][a-zA-Z0-9_]*") {
            // Ha! Proptest will reliably find these.
            prop_assume!(!Identifier::is_reserved(&s));

            prop_assert!(new(&s).is_ok());
        }
    }
}