Skip to content

Commit b9df189

Browse files
committed
Support parsing doubly quoted strings as identifiers
Per https://www.sqlite.org/lang_keywords.html, SQLite supports identifiers written as strings enclosed in double quotes:

$ sqlite3
SQLite version 3.52.0 2026-03-06 16:01:44
Enter ".help" for usage hints.
Connected to a transient in-memory database.
Use ".open FILENAME" to reopen on a persistent database.
sqlite> create table a("oh,boy!" TEXT);
sqlite> pragma table_info(a);
╭─────┬─────────┬──────┬─────────┬────────────┬────╮
│ cid │ name    │ type │ notnull │ dflt_value │ pk │
╞═════╪═════════╪══════╪═════════╪════════════╪════╡
│ 0   │ oh,boy! │ TEXT │ 0       │ NULL       │ 0  │
╰─────┴─────────┴──────┴─────────┴────────────┴────╯

When identifiers are parsed this way, no escaping applies, i.e. a backslash does not affect the character that follows it:

sqlite> create table b("oh\tno\" TEXT);
sqlite> pragma table_info(b);
╭─────┬─────────┬──────┬─────────┬────────────┬────╮
│ cid │ name    │ type │ notnull │ dflt_value │ pk │
╞═════╪═════════╪══════╪═════════╪════════════╪════╡
│ 0   │ oh\tno\ │ TEXT │ 0       │ NULL       │ 0  │
╰─────┴─────────┴──────┴─────────┴────────────┴────╯

Make the sqlite-vec internal scanner handle these identifiers, so that column names in a vec0 virtual table are subject to the same (reduced) limitations as those of any other SQLite table.

Note: SQLite also implements similar support for single-quoted strings to be handled as identifiers. This is done for compatibility with other SQL implementations rather than for standard conformance, so it's perhaps a bit more dubious to support.
1 parent 5778fec commit b9df189

1 file changed

Lines changed: 57 additions & 15 deletions

File tree

sqlite-vec.c

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2072,6 +2072,7 @@ static void _static_text_func(sqlite3_context *context, int argc,
20722072

20732073
enum Vec0TokenType {
20742074
TOKEN_TYPE_IDENTIFIER,
2075+
TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER,
20752076
TOKEN_TYPE_DIGIT,
20762077
TOKEN_TYPE_LBRACKET,
20772078
TOKEN_TYPE_RBRACKET,
@@ -2157,6 +2158,17 @@ int vec0_token_next(char *start, char *end, struct Vec0Token *out) {
21572158
out->end = ptr;
21582159
out->token_type = TOKEN_TYPE_IDENTIFIER;
21592160
return VEC0_TOKEN_RESULT_SOME;
2161+
} else if (curr == '"') {
2162+
char *start = ptr;
2163+
int match = 0;
2164+
do {
2165+
match = ptr > start && (*ptr == '"');
2166+
ptr++;
2167+
} while (ptr < end && !match);
2168+
out->start = start;
2169+
out->end = ptr;
2170+
out->token_type = TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER;
2171+
return VEC0_TOKEN_RESULT_SOME;
21602172
} else if (is_digit(curr)) {
21612173
char *start = ptr;
21622174
while (ptr < end && (is_digit(*ptr))) {
@@ -2263,12 +2275,18 @@ int vec0_parse_partition_key_definition(const char *source, int source_length,
22632275
// Check first token is identifier, will be the column name
22642276
int rc = vec0_scanner_next(&scanner, &token);
22652277
if (rc != VEC0_TOKEN_RESULT_SOME &&
2266-
token.token_type != TOKEN_TYPE_IDENTIFIER) {
2278+
!((token.token_type == TOKEN_TYPE_IDENTIFIER) ||
2279+
(token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER))) {
22672280
return SQLITE_EMPTY;
22682281
}
22692282

2270-
column_name = token.start;
2271-
column_name_length = token.end - token.start;
2283+
if (token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER) {
2284+
column_name = token.start + 1;
2285+
column_name_length = token.end - token.start - 2;
2286+
} else {
2287+
column_name = token.start;
2288+
column_name_length = token.end - token.start;
2289+
}
22722290

22732291
// Check the next token matches "text" or "integer", as column type
22742292
rc = vec0_scanner_next(&scanner, &token);
@@ -2346,12 +2364,18 @@ int vec0_parse_auxiliary_column_definition(const char *source, int source_length
23462364

23472365
rc = vec0_scanner_next(&scanner, &token);
23482366
if (rc != VEC0_TOKEN_RESULT_SOME &&
2349-
token.token_type != TOKEN_TYPE_IDENTIFIER) {
2367+
!((token.token_type == TOKEN_TYPE_IDENTIFIER) ||
2368+
(token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER))) {
23502369
return SQLITE_EMPTY;
23512370
}
23522371

2353-
column_name = token.start;
2354-
column_name_length = token.end - token.start;
2372+
if (token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER) {
2373+
column_name = token.start + 1;
2374+
column_name_length = token.end - token.start - 2;
2375+
} else {
2376+
column_name = token.start;
2377+
column_name_length = token.end - token.start;
2378+
}
23552379

23562380
// Check the next token matches "text" or "integer", as column type
23572381
rc = vec0_scanner_next(&scanner, &token);
@@ -2418,12 +2442,18 @@ int vec0_parse_metadata_column_definition(const char *source, int source_length,
24182442

24192443
rc = vec0_scanner_next(&scanner, &token);
24202444
if (rc != VEC0_TOKEN_RESULT_SOME ||
2421-
token.token_type != TOKEN_TYPE_IDENTIFIER) {
2445+
!((token.token_type == TOKEN_TYPE_IDENTIFIER) ||
2446+
(token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER))) {
24222447
return SQLITE_EMPTY;
24232448
}
24242449

2425-
column_name = token.start;
2426-
column_name_length = token.end - token.start;
2450+
if (token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER) {
2451+
column_name = token.start + 1;
2452+
column_name_length = token.end - token.start - 2;
2453+
} else {
2454+
column_name = token.start;
2455+
column_name_length = token.end - token.start;
2456+
}
24272457

24282458
// Check the next token matches a valid metadata type
24292459
rc = vec0_scanner_next(&scanner, &token);
@@ -2478,12 +2508,18 @@ int vec0_parse_primary_key_definition(const char *source, int source_length,
24782508
// Check first token is identifier, will be the column name
24792509
int rc = vec0_scanner_next(&scanner, &token);
24802510
if (rc != VEC0_TOKEN_RESULT_SOME &&
2481-
token.token_type != TOKEN_TYPE_IDENTIFIER) {
2511+
!((token.token_type == TOKEN_TYPE_IDENTIFIER) ||
2512+
(token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER))) {
24822513
return SQLITE_EMPTY;
24832514
}
24842515

2485-
column_name = token.start;
2486-
column_name_length = token.end - token.start;
2516+
if (token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER) {
2517+
column_name = token.start + 1;
2518+
column_name_length = token.end - token.start - 2;
2519+
} else {
2520+
column_name = token.start;
2521+
column_name_length = token.end - token.start;
2522+
}
24872523

24882524
// Check the next token matches "text" or "integer", as column type
24892525
rc = vec0_scanner_next(&scanner, &token);
@@ -2998,12 +3034,18 @@ int vec0_parse_vector_column(const char *source, int source_length,
29983034
rc = vec0_scanner_next(&scanner, &token);
29993035

30003036
if (rc != VEC0_TOKEN_RESULT_SOME &&
3001-
token.token_type != TOKEN_TYPE_IDENTIFIER) {
3037+
!((token.token_type == TOKEN_TYPE_IDENTIFIER) ||
3038+
(token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER))) {
30023039
return SQLITE_EMPTY;
30033040
}
30043041

3005-
name = token.start;
3006-
nameLength = token.end - token.start;
3042+
if (token.token_type == TOKEN_TYPE_DOUBLY_QUOTED_IDENTIFIER) {
3043+
name = token.start + 1;
3044+
nameLength = token.end - token.start - 2;
3045+
} else {
3046+
name = token.start;
3047+
nameLength = token.end - token.start;
3048+
}
30073049

30083050
// vector column type comes next: float, int, or bit
30093051
rc = vec0_scanner_next(&scanner, &token);

0 commit comments

Comments
 (0)