-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
MDEV-37009 bulk insert into vector index #5150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
a0e7cf3
ffaa6a1
79be8c4
d1e45cc
db869c7
fcdc5f3
8d65ddb
76d9d4e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| [innodb] | ||
| innodb | ||
| default-storage-engine=innodb | ||
|
|
||
| [myisam] | ||
| default-storage-engine=myisam | ||
|
|
||
| [aria] | ||
| default-storage-engine=aria |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| # | ||
| # Test memory budget fallback during MHNSW bulk insert | ||
| # | ||
| # 1. Normal bulk insert works when memory budget is large enough | ||
| create table t1 (id int auto_increment primary key, v vector(3) not null); | ||
| insert into t1 (v) values (x'000000000000000000000000'), | ||
| (x'0000803f0000000000000000'), | ||
| (x'000000000000803f00000000'), | ||
| (x'00000000000000000000803f'); | ||
| alter table t1 add vector index (v); | ||
| show warnings; | ||
| Level Code Message | ||
| # Test search using the successfully bulk-built index | ||
| select id, vec_distance_euclidean(v, x'0000803f0000000000000000') d from t1 order by d limit 2; | ||
| id d | ||
| 2 0 | ||
| 1 1 | ||
| drop table t1; | ||
| # 2. Bulk insert falls back to normal insert when mhnsw_max_cache_size is small | ||
| set @old_cache_size= @@global.mhnsw_max_cache_size; | ||
| set global mhnsw_max_cache_size= 1048576; | ||
| set max_recursive_iterations= 300; | ||
| create table t1 (id int auto_increment primary key, v vector(3000) not null); | ||
| insert into t1 (v) | ||
| with recursive cte as ( | ||
| select 1 as n | ||
| union all | ||
| select n + 1 from cte where n < 115 | ||
| ) | ||
| select repeat(x'00', 12000) from cte; | ||
| insert into t1 (id, v) values (999, concat(repeat(x'00', 11996), x'0000803f')); | ||
| # Adding index with small cache size should trigger fallback and show a warning | ||
| alter table t1 add vector index (v) m=200; | ||
| Warnings: | ||
| Note 1105 MHNSW: Bulk insert disabled because estimated memory usage (estimated_mem) exceeds mhnsw_max_cache_size (1048576). Falling back to normal insert. | ||
| show warnings; | ||
| Level Code Message | ||
| Note 1105 MHNSW: Bulk insert disabled because estimated memory usage (estimated_mem) exceeds mhnsw_max_cache_size (1048576). Falling back to normal insert. | ||
| # Test search using the fallback-built index to ensure it is healthy and correct | ||
| select id, vec_distance_euclidean(v, concat(repeat(x'00', 11996), x'0000803f')) d from t1 order by d limit 1; | ||
| id d | ||
| 999 0 | ||
| drop table t1; | ||
| set global mhnsw_max_cache_size= @old_cache_size; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| --echo # | ||
| --echo # Test memory budget fallback during MHNSW bulk insert | ||
| --echo # | ||
|
|
||
| --echo # 1. Normal bulk insert works when memory budget is large enough | ||
| create table t1 (id int auto_increment primary key, v vector(3) not null); | ||
| insert into t1 (v) values (x'000000000000000000000000'), | ||
| (x'0000803f0000000000000000'), | ||
| (x'000000000000803f00000000'), | ||
| (x'00000000000000000000803f'); | ||
| alter table t1 add vector index (v); | ||
| show warnings; | ||
|
|
||
| --echo # Test search using the successfully bulk-built index | ||
| --replace_regex /(\.\d{5})\d+/\1/ | ||
| select id, vec_distance_euclidean(v, x'0000803f0000000000000000') d from t1 order by d limit 2; | ||
|
|
||
| drop table t1; | ||
|
|
||
| --echo # 2. Bulk insert falls back to normal insert when mhnsw_max_cache_size is small | ||
| set @old_cache_size= @@global.mhnsw_max_cache_size; | ||
| set global mhnsw_max_cache_size= 1048576; | ||
| set max_recursive_iterations= 300; | ||
|
|
||
| create table t1 (id int auto_increment primary key, v vector(3000) not null); | ||
|
|
||
| # Insert 115 rows of 3000-dimensional vectors. | ||
| # Total size: 115 * ~9288 bytes/row (with M=200) = ~1.06 MB, exceeding 1MB. | ||
| insert into t1 (v) | ||
| with recursive cte as ( | ||
| select 1 as n | ||
| union all | ||
| select n + 1 from cte where n < 115 | ||
| ) | ||
| select repeat(x'00', 12000) from cte; | ||
|
|
||
| # Insert a unique search target vector with fixed ID 999 to guarantee deterministic results across all engines | ||
| insert into t1 (id, v) values (999, concat(repeat(x'00', 11996), x'0000803f')); | ||
|
|
||
| --echo # Adding index with small cache size should trigger fallback and show a warning | ||
| --replace_regex /usage \(\d+\)/usage (estimated_mem)/ | ||
| alter table t1 add vector index (v) m=200; | ||
| --replace_regex /usage \(\d+\)/usage (estimated_mem)/ | ||
| show warnings; | ||
|
|
||
| --echo # Test search using the fallback-built index to ensure it is healthy and correct | ||
| --replace_regex /(\.\d{5})\d+/\1/ | ||
| select id, vec_distance_euclidean(v, concat(repeat(x'00', 11996), x'0000803f')) d from t1 order by d limit 1; | ||
|
|
||
| drop table t1; | ||
| set global mhnsw_max_cache_size= @old_cache_size; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10145,11 +10145,15 @@ int TABLE::unlock_hlindexes() | |
|
|
||
| int TABLE::hlindexes_on_insert() | ||
| { | ||
| DBUG_ASSERT(s->hlindexes() == (hlindex != NULL)); | ||
| if (hlindex && hlindex->in_use) | ||
| if (int err= mhnsw_insert(this, key_info + s->keys)) | ||
| return err; | ||
| return 0; | ||
| DBUG_ASSERT(s->hlindexes() == (hlindex != NULL)); | ||
| if (hlindex && hlindex->in_use) | ||
| { | ||
| if (hlindex->bulk_insert_active) | ||
| return mhnsw_bulk_insert_row(this, key_info + s->keys); | ||
| else | ||
| return mhnsw_insert(this, key_info + s->keys); | ||
| } | ||
| return 0; | ||
| } | ||
|
|
||
| int TABLE::hlindexes_on_update() | ||
|
|
@@ -10208,3 +10212,36 @@ int TABLE::hlindex_read_end() | |
| { | ||
| return mhnsw_read_end(this); | ||
| } | ||
|
|
||
| int TABLE::hlindexes_bulk_insert_begin(ha_rows rows) | ||
| { | ||
| if (s->hlindexes()) | ||
| { | ||
| if (!hlindex || !hlindex->in_use) | ||
| if (int err= open_hlindexes_for_write()) | ||
| return err; | ||
|
|
||
| if (hlindex && hlindex->in_use) | ||
| { | ||
| int err= mhnsw_bulk_insert_begin(this, key_info + s->keys, rows); | ||
| if (err) | ||
| { | ||
| hlindex->bulk_insert_active= false; | ||
| return err; | ||
| } | ||
| if (hlindex->context) | ||
| hlindex->bulk_insert_active= true; | ||
| } | ||
| } | ||
| return 0; | ||
| } | ||
|
|
||
| int TABLE::hlindexes_bulk_insert_end() | ||
| { | ||
| if (hlindex && hlindex->in_use) | ||
| { | ||
| hlindex->bulk_insert_active= false; | ||
| return mhnsw_bulk_insert_end(this, key_info + s->keys); | ||
| } | ||
| return 0; | ||
| } | ||
|
Comment on lines
+10216
to
+10247
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are three issues here:
int TABLE::hlindexes_bulk_insert_begin(ha_rows rows)
{
if (hlindex && hlindex->in_use)
{
bulk_insert_active= true;
int err= mhnsw_bulk_insert_begin(this, key_info + s->keys, rows);
if (err)
bulk_insert_active= false;
return err;
}
return 0;
}
int TABLE::hlindexes_bulk_insert_end()
{
if (hlindex && hlindex->in_use)
{
bulk_insert_active= false;
return mhnsw_bulk_insert_end(this, key_info + s->keys);
}
return 0;
} |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12616,6 +12616,7 @@ copy_data_between_tables(THD *thd, TABLE *from, TABLE *to, | |
| bool make_unversioned= from->versioned() && !to->versioned(); | ||
| bool keep_versioned= from->versioned() && to->versioned(); | ||
| bool bulk_insert_started= 0; | ||
| bool hlindex_bulk_started= 0; | ||
| Field *to_row_start= NULL, *to_row_end= NULL, *from_row_end= NULL; | ||
| MYSQL_TIME query_start; | ||
| DBUG_ENTER("copy_data_between_tables"); | ||
|
|
@@ -12662,11 +12663,20 @@ copy_data_between_tables(THD *thd, TABLE *from, TABLE *to, | |
|
|
||
| from->file->info(HA_STATUS_VARIABLE); | ||
| to->file->extra(HA_EXTRA_PREPARE_FOR_ALTER_TABLE); | ||
| if (!to->s->long_unique_table && !to->s->hlindexes()) | ||
|
|
||
| if (!to->s->long_unique_table) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when you remove something, check in the git history why it was added. The check above was in the commit with the comment "Don't enable bulk insert when altering a table containing vector index. InnoDB can't handle situation when bulk insert is enabled for one table but disabled for another" That is, you should first do
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I checked and actually talked about this in Zulip :) but yeah I forgot to make sure we only enable it when hlindex_bulk_begin succeed |
||
| { | ||
| to->file->ha_start_bulk_insert(from->file->stats.records, | ||
| ignore ? 0 : HA_CREATE_UNIQUE_INDEX_BY_SORT); | ||
| bulk_insert_started= 1; | ||
| if (to->s->hlindexes()) | ||
| { | ||
| if (to->hlindexes_bulk_insert_begin(from->file->stats.records) == 0) | ||
| hlindex_bulk_started= 1; | ||
| } | ||
| if (!to->s->hlindexes() || hlindex_bulk_started) | ||
| { | ||
| to->file->ha_start_bulk_insert(from->file->stats.records, | ||
| ignore ? 0 : HA_CREATE_UNIQUE_INDEX_BY_SORT); | ||
| bulk_insert_started= 1; | ||
| } | ||
| } | ||
|
Comment on lines
+12667
to
12680
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The return value of Additionally, please update the indentation to 2 spaces to match the repository's implicit style guide. if (!to->s->long_unique_table)
{
to->file->ha_start_bulk_insert(from->file->stats.records,
ignore ? 0 : HA_CREATE_UNIQUE_INDEX_BY_SORT);
bulk_insert_started= 1;
if (to->s->hlindexes())
{
if ((error= to->hlindexes_bulk_insert_begin(from->file->stats.records)))
{
to->file->print_error(error, MYF(0));
}
else
{
hlindex_bulk_started= 1;
}
}
} |
||
| mysql_stage_set_work_estimated(thd->m_stage_progress_psi, from->file->stats.records); | ||
| List_iterator<Create_field> it(alter_info->create_list); | ||
|
|
@@ -12999,6 +13009,14 @@ copy_data_between_tables(THD *thd, TABLE *from, TABLE *to, | |
| } | ||
|
|
||
| bulk_insert_started= 0; | ||
| if (hlindex_bulk_started && to->hlindexes_bulk_insert_end() && error <= 0) | ||
| { | ||
| if (!thd->is_error()) | ||
| to->file->print_error(my_errno, MYF(0)); | ||
| error= 1; | ||
| } | ||
| hlindex_bulk_started=0; | ||
|
Comment on lines
+13012
to
+13018
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please update the indentation to 2 spaces and add proper spacing around operators (e.g., if (hlindex_bulk_started && to->hlindexes_bulk_insert_end() && error <= 0)
{
if (!thd->is_error())
to->file->print_error(my_errno, MYF(0));
error= 1;
}
hlindex_bulk_started= 0; |
||
|
|
||
| if (error <= 0 && !to->s->hlindexes()) | ||
| { | ||
| Abort_on_warning_instant_set save_abort_on_warning(thd, false); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The member
bulk_insert_activewas added to theTABLEstruct insql/table.h, but here it is accessed viahlindex->bulk_insert_active. Sincehlindexis not of typeTABLE*, this will cause a compilation error. It should be accessed directly asbulk_insert_active(orthis->bulk_insert_active).Additionally, please update the indentation to 2 spaces to match the repository's implicit style guide.