Skip to content

Commit 7006d51

Browse files
SmartKeyerrorreshke
authored andcommitted
fix incorrect scan position during bitmap index words scan (#13479)
Since this PR #11377 has fixed bitmap index scan when concurrent insert update full bitmap page, which resolved concurrent read on bitmap index when there's insert running in the backend may cause the bitmap scan read wrong tid: 1. Query on bitmap: A query starts and reads all bitmap pages to PAGE_FULL, increases the next tid to fetch, and releases the lock after reading each page. 2. Concurrent insert: insert a tid into PAGE_FULL cause expand compressed words to new words, and rearrange words into PAGE_NEXT. 3. Query on bitmap: fetch PAGE_NEXT and expect the first tid in it should equal the saved next tid. But actually PAGE_NEXT now contains words used to belong in PAGE_FULL. This causes the real next tid less than the expected next tid. But our scan keeps increasing the wrong tid. And then this leads to a wrong result. The PR used _bitmap_catchup_to_next_tid() function to adjust result->lastScanWordNo to the correct position if there has concurrent read/write and causes rearranging words into the next bitmap index page, it used the value of words->firstTid and result->nextTid to judge whether rearranging words into the next bitmap index page happened or not, this is not entirely right. This is because BMIterateResult can only store 16*1024=16384 TIDs, but BMBatchWords can save 3968 words by default, a words is 64bit, even in the worst case (all words not compressed) , BMBatchWords can hold 3968 * 64 = 253952 TIDs. So if there has a PAGE_FULL, the PAGE_FULL must scan it more than one time, but the value of BMBatchWords->firstTid will not be updated during each scan, it will only be updated when new bitmap index pages are read. Therefore, in the absence of concurrent read/write, if we need to scan the same BMBatchWords multiple times, it will lead to the wrong scan position, resulting in wrong output results, as #13446. To summarize, we just need to check for a rearranged condition when the new bitmap index page is read from disk, we should do nothing when we scan the same BMBatchWords.
1 parent 94bcb50 commit 7006d51

5 files changed

Lines changed: 80 additions & 7 deletions

File tree

src/backend/access/bitmap/bitmap.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,9 @@ copy_scan_desc(IndexScanDesc scan)
890890
*
891891
* If newentry is false, we're calling the function with a partially filled
892892
* page table entry. Otherwise, the entry is empty.
893+
*
894+
* This function is only used in stream bitmap scan, more specifically, it's
895+
* BitmapIndexScan + BitmapHeapScan.
893896
*/
894897

895898
static bool
@@ -960,7 +963,7 @@ words_get_match(BMBatchWords *words, BMIterateResult *result,
960963
*/
961964
if (words->firstTid < result->nextTid)
962965
{
963-
Assert(words->nwords < 1);
966+
Assert(words->nwords == 0);
964967
return false;
965968
}
966969

src/backend/access/bitmap/bitmaputil.c

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -260,9 +260,23 @@ _bitmap_findnexttids(BMBatchWords *words, BMIterateResult *result,
260260

261261
result->nextTidLoc = result->numOfTids = 0;
262262

263-
_bitmap_catchup_to_next_tid(words, result);
264-
265-
Assert(words->firstTid == result->nextTid);
263+
/*
264+
* Only in the situation that there have concurrent read/write on two
265+
* adjacent bitmap index pages, and inserting a tid into PAGE_FULL cause expand
266+
* compressed words to new words, and rearrange those words into PAGE_NEXT,
267+
* and we ready to read a new page, we should adjust result-> lastScanWordNo
268+
* to the current position.
269+
*
270+
* The value of words->startNo will always be 0, this value will only used at
271+
* _bitmap_union to union a bunch of bitmaps, the union result will be stored
272+
* at words. result->lastScanWordNo indicates the location in words->cwords that
273+
* BMIterateResult will read the word next, it's start from 0, and will
274+
* self-incrementing during the scan. So if result->lastScanWordNo equals to
275+
* words->startNo, means we will scan a new bitmap index pages.
276+
*/
277+
if (result->lastScanWordNo == words->startNo &&
278+
words->firstTid < result->nextTid)
279+
_bitmap_catchup_to_next_tid(words, result);
266280

267281
while (words->nwords > 0 && result->numOfTids < maxTids && !done)
268282
{
@@ -358,10 +372,8 @@ _bitmap_findnexttids(BMBatchWords *words, BMIterateResult *result,
358372

359373
/*
360374
* _bitmap_catchup_to_next_tid - Catch up to the nextTid we need to check
361-
* from last iteration.
375+
* from last iteration, in the following cases:
362376
*
363-
* Normally words->firstTid should equal to result->nextTid. But there
364-
* are exceptions:
365377
* 1: When the concurrent insert causes bitmap items from previous full page
366378
* to spill over to current page in the window when we (the read transaction)
367379
* had released the lock on the previous page and not locked the current page.

src/test/regress/expected/bitmap_index.out

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,3 +1088,23 @@ select * from bm_test where b = 1;
10881088

10891089
-- clean up
10901090
drop table bm_test;
1091+
-- test the scenario that we need read the same batch words many times
1092+
-- more detials can be found at https://github.com/greenplum-db/gpdb/issues/13446
1093+
SET enable_seqscan = OFF;
1094+
SET enable_bitmapscan = OFF;
1095+
create table foo_13446(a int, b int);
1096+
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
1097+
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
1098+
create index idx_13446 on foo_13446 using bitmap(b);
1099+
insert into foo_13446 select 1, 1 from generate_series(0, 16384);
1100+
-- At current implementation, BMIterateResult can only store 16*1024=16384 TIDs,
1101+
-- if we have 13685 TIDs to read, it must scan same batch words twice, that's what we want
1102+
select count(*) from foo_13446 where b = 1;
1103+
count
1104+
-------
1105+
16385
1106+
(1 row)
1107+
1108+
drop table foo_13446;
1109+
SET enable_seqscan = ON;
1110+
SET enable_bitmapscan = ON;

src/test/regress/expected/bitmap_index_optimizer.out

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,3 +1099,23 @@ select * from bm_test where b = 1;
10991099

11001100
-- clean up
11011101
drop table bm_test;
1102+
-- test the scenario that we need read the same batch words many times
1103+
-- more detials can be found at https://github.com/greenplum-db/gpdb/issues/13446
1104+
SET enable_seqscan = OFF;
1105+
SET enable_bitmapscan = OFF;
1106+
create table foo_13446(a int, b int);
1107+
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
1108+
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
1109+
create index idx_13446 on foo_13446 using bitmap(b);
1110+
insert into foo_13446 select 1, 1 from generate_series(0, 16384);
1111+
-- At current implementation, BMIterateResult can only store 16*1024=16384 TIDs,
1112+
-- if we have 13685 TIDs to read, it must scan same batch words twice, that's what we want
1113+
select count(*) from foo_13446 where b = 1;
1114+
count
1115+
-------
1116+
16385
1117+
(1 row)
1118+
1119+
drop table foo_13446;
1120+
SET enable_seqscan = ON;
1121+
SET enable_bitmapscan = ON;

src/test/regress/sql/bitmap_index.sql

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,3 +473,21 @@ select * from bm_test where b = 1;
473473

474474
-- clean up
475475
drop table bm_test;
476+
477+
-- test the scenario that we need read the same batch words many times
478+
-- more detials can be found at https://github.com/greenplum-db/gpdb/issues/13446
479+
SET enable_seqscan = OFF;
480+
SET enable_bitmapscan = OFF;
481+
482+
create table foo_13446(a int, b int);
483+
create index idx_13446 on foo_13446 using bitmap(b);
484+
insert into foo_13446 select 1, 1 from generate_series(0, 16384);
485+
-- At current implementation, BMIterateResult can only store 16*1024=16384 TIDs,
486+
-- if we have 13685 TIDs to read, it must scan same batch words twice, that's what we want
487+
select count(*) from foo_13446 where b = 1;
488+
489+
drop table foo_13446;
490+
491+
SET enable_seqscan = ON;
492+
SET enable_bitmapscan = ON;
493+

0 commit comments

Comments
 (0)