pingcap
diff --git a/‎dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp‎
Lines changed: 2 additions & 0 deletions b/‎dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎dbms/src/Operators/tests/gtest_concat_source.cpp‎
Lines changed: 159 additions & 1 deletion b/‎dbms/src/Operators/tests/gtest_concat_source.cpp‎
Lines changed: 159 additions & 1 deletion
diff --git a/‎dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp‎
Lines changed: 27 additions & 14 deletions b/‎dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp‎
Lines changed: 27 additions & 14 deletions
diff --git a/‎dbms/src/Storages/DeltaMerge/DeltaMergeStore.h‎
Lines changed: 14 additions & 9 deletions b/‎dbms/src/Storages/DeltaMerge/DeltaMergeStore.h‎
Lines changed: 14 additions & 9 deletions
@@ -921,13 +921,15 @@ std::unordered_map<TableID, SelectQueryInfo> DAGStorageInterpreter::generateSele
     RUNTIME_CHECK_MSG(mvcc_query_info->scan_context != nullptr, "Unexpected null scan_context");
     if (table_scan.isPartitionTableScan())
     {
+        bool has_multiple_partitions = table_scan.getPhysicalTableIDs().size() > 1;
         for (const auto physical_table_id : table_scan.getPhysicalTableIDs())
         {
             SelectQueryInfo query_info = create_query_info(physical_table_id);
             query_info.mvcc_query_info = std::make_unique<MvccQueryInfo>(
                 mvcc_query_info->resolve_locks,
                 mvcc_query_info->start_ts,
                 mvcc_query_info->scan_context);
+            query_info.has_multiple_partitions = has_multiple_partitions;
             ret.emplace(physical_table_id, std::move(query_info));
         }
         // Dispatch the regions_query_info to different physical table's query_info
 
@@ -12,13 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Common/Logger.h>
+#include <Common/Stopwatch.h>
+#include <Common/ThreadManager.h>
 #include <Flash/Executor/PipelineExecutorContext.h>
+#include <Flash/Pipeline/Schedule/Tasks/NotifyFuture.h>
 #include <Operators/ConcatSourceOp.h>
+#include <Storages/DeltaMerge/ReadThread/WorkQueue.h>
 #include <TestUtils/ColumnGenerator.h>
+#include <TestUtils/TiFlashTestBasic.h>
 #include <gtest/gtest.h>
 
 #include <memory>
-#include <type_traits>
 
 namespace DB::tests
 {
@@ -97,4 +102,157 @@ TEST_F(TestConcatSource, concatSink)
     ASSERT_EQ(actual_block_cnt, block_cnt);
 }
 
+namespace
+{
+class SyncBlocks // Hands out a fixed list of blocks one at a time; getNext() is guarded by a mutex.
+{
+public:
+    explicit SyncBlocks(Blocks blks_)
+        : blocks(std::move(blks_))
+        , current(blocks.begin()) // cursor starts at the first stored block
+    {}
+
+    Block getNext() // Returns the next stored block, or an empty Block once every block has been handed out.
+    {
+        std::lock_guard lock(mtx);
+        if (current == blocks.end())
+            return Block{};
+        Block res = *current;
+        ++current;
+        return res;
+    }
+
+private:
+    std::mutex mtx; // serializes getNext()
+    Blocks blocks;
+    Blocks::iterator current; // next block to hand out; declared after `blocks` so it is initialized from it
+};
+
+class MockSourceOpFromQueue : public SourceOp // Test source op that pulls its blocks from a shared SyncBlocks.
+{
+public:
+    MockSourceOpFromQueue(
+        PipelineExecutorContext & exec_context_,
+        const std::shared_ptr<SyncBlocks> & sync_blocks_,
+        const Block & header)
+        : SourceOp(exec_context_, "mock")
+        , sync_blocks(sync_blocks_)
+    {
+        setHeader(header);
+    }
+
+    String getName() const override { return "MockSourceOpFromQueue"; }
+
+protected:
+    OperatorStatus readImpl(Block & block) override // Always returns HAS_OUTPUT, even when `block` ends up empty.
+    {
+        block = sync_blocks->getNext(); // empty Block once the shared list is exhausted
+        return OperatorStatus::HAS_OUTPUT;
+    }
+
+private:
+    std::shared_ptr<SyncBlocks> sync_blocks; // may be shared with other source ops
+};
+class SimpleGetResultSinkOp : public SinkOp // Test sink op that passes every non-empty block to a ResultHandler.
+{
+public:
+    SimpleGetResultSinkOp(PipelineExecutorContext & exec_context_, const String & req_id, ResultHandler result_handler_)
+        : SinkOp(exec_context_, req_id)
+        , result_handler(std::move(result_handler_))
+    {
+        assert(result_handler); // NOTE(review): presumably checks that a handler was actually set - confirm
+    }
+
+    String getName() const override { return "SimpleGetResultSinkOp"; }
+
+protected:
+    OperatorStatus writeImpl(Block && block) override // Empty block => FINISHED; otherwise handle it and return NEED_INPUT.
+    {
+        if (!block)
+            return OperatorStatus::FINISHED;
+
+        result_handler(block);
+        return OperatorStatus::NEED_INPUT;
+    }
+
+private:
+    ResultHandler result_handler; // invoked once per non-empty block
+};
+
+} // namespace
+
+TEST_F(TestConcatSource, ConcatBuilderPoolWithDifferentConcurrency) // Partitions with 4 and 2 source ops go through a ConcatBuilderPool of concurrency 8; the sink must see every block.
+try
+{
+    LoggerPtr log = Logger::get();
+
+    Blocks blks{ // five blocks, each built from a generated Int32 column
+        Block{ColumnGenerator::instance().generate({2, "Int32", DataDistribution::RANDOM})},
+        Block{ColumnGenerator::instance().generate({2, "Int32", DataDistribution::RANDOM})},
+        Block{ColumnGenerator::instance().generate({2, "Int32", DataDistribution::RANDOM})},
+        Block{ColumnGenerator::instance().generate({2, "Int32", DataDistribution::RANDOM})},
+        Block{ColumnGenerator::instance().generate({2, "Int32", DataDistribution::RANDOM})},
+    };
+
+    Block header = blks[0].cloneEmpty();
+
+
+    PipelineExecutorContext exec_context;
+    size_t num_concurrency = 8;
+    ConcatBuilderPool builder_pool{num_concurrency};
+    // Mock that for each partition (physical table), there is a read task pool and multiple source ops reading from it.
+    size_t num_partitions = 2;
+    for (size_t idx_part = 0; idx_part < num_partitions; ++idx_part)
+    {
+        // Mock that different partitions have different concurrency.
+        size_t partition_concurrency = num_concurrency;
+        if (idx_part == 0)
+            partition_concurrency = 4;
+        else if (idx_part == 1)
+            partition_concurrency = 2;
+        // Mock a queue shared on one partition
+        auto blks_queue = std::make_shared<SyncBlocks>(blks); // each partition gets its own copy of blks
+        PipelineExecGroupBuilder group_builder;
+        for (size_t i = 0; i < partition_concurrency; ++i)
+        {
+            group_builder.addConcurrency(std::make_unique<MockSourceOpFromQueue>(exec_context, blks_queue, header));
+        }
+        builder_pool.add(group_builder);
+    }
+
+    std::atomic<size_t> received_blocks = 0; // number of blocks that reached the sink handler
+    ResultHandler h([&](const Block & /*block*/) { received_blocks.fetch_add(1, std::memory_order_relaxed); });
+
+    PipelineExecGroupBuilder result_builder;
+    builder_pool.generate(result_builder, exec_context, "test");
+    result_builder.transform(
+        [&](auto & builder) { builder.setSinkOp(std::make_unique<SimpleGetResultSinkOp>(exec_context, "test", h)); });
+    auto op_pipeline_grp = result_builder.build(false);
+
+
+    auto mgr = newThreadPoolManager(num_concurrency);
+
+    for (const auto & pipe : op_pipeline_grp)
+    {
+        mgr->schedule(false, [&pipe, &log]() { // captured by reference: relies on mgr->wait() below finishing first
+            pipe->executePrefix();
+            while (true) // keep calling execute() until the pipeline reports FINISHED
+            {
+                auto s = pipe->execute();
+                if (s == OperatorStatus::FINISHED)
+                {
+                    LOG_INFO(log, "ConcatPipelineExec is finished");
+                    break;
+                }
+            }
+            pipe->executeSuffix();
+        });
+    }
+    mgr->wait();
+
+    LOG_INFO(log, "ConcatPipelineExec is built and executed, received_blocks={}", received_blocks.load());
+    ASSERT_EQ(received_blocks.load(), blks.size() * num_partitions); // every partition hands out each block in blks exactly once
+}
+CATCH
+
 } // namespace DB::tests
@@ -1250,8 +1250,7 @@ BlockInputStreams DeltaMergeStore::read(
     const RuntimeFilteList & runtime_filter_list,
     int rf_max_wait_time_ms,
     const String & tracing_id,
-    bool keep_order,
-    bool is_fast_scan,
+    const DMReadOptions & read_opts,
     size_t expected_block_size,
     const SegmentIdSet & read_segments,
     size_t extra_table_id_index,
@@ -1261,7 +1260,7 @@ BlockInputStreams DeltaMergeStore::read(
     auto dm_context = newDMContext(db_context, db_settings, tracing_id, scan_context);
 
     // If keep order is required, disable read thread.
-    auto enable_read_thread = db_context.getSettingsRef().dt_enable_read_thread && !keep_order;
+    auto enable_read_thread = db_context.getSettingsRef().dt_enable_read_thread && !read_opts.keep_order;
     // SegmentReadTaskScheduler and SegmentReadTaskPool use table_id + segment id as unique ID when read thread is enabled.
     // 'try_split_task' can result in several read tasks with the same id that can cause some trouble.
     // Also, too many read tasks of a segment with different small ranges is not good for data sharing cache.
@@ -1281,7 +1280,7 @@ BlockInputStreams DeltaMergeStore::read(
 
     GET_METRIC(tiflash_storage_read_tasks_count).Increment(tasks.size());
     size_t final_num_stream = std::max(1, std::min(num_streams, tasks.size()));
-    auto read_mode = getReadMode(db_context, is_fast_scan, keep_order, filter);
+    auto read_mode = getReadMode(db_context, read_opts.is_fast_scan, read_opts.keep_order, filter);
     const auto & final_columns_to_read = filter && filter->extra_cast ? *filter->columns_after_cast : columns_to_read;
     auto read_task_pool = std::make_shared<SegmentReadTaskPool>(
         extra_table_id_index,
@@ -1334,10 +1333,10 @@ BlockInputStreams DeltaMergeStore::read(
         "Read create stream done, keep_order={} dt_enable_read_thread={} enable_read_thread={} "
         "is_fast_scan={} is_push_down_filter_empty={} pool_id={} num_streams={} columns_to_read={} "
         "final_columns_to_read={}",
-        keep_order,
+        read_opts.keep_order,
         db_context.getSettingsRef().dt_enable_read_thread,
         enable_read_thread,
-        is_fast_scan,
+        read_opts.is_fast_scan,
         filter == nullptr || filter->before_where == nullptr,
         read_task_pool->pool_id,
         final_num_stream,
@@ -1360,8 +1359,7 @@ void DeltaMergeStore::read(
     const RuntimeFilteList & runtime_filter_list,
     int rf_max_wait_time_ms,
     const String & tracing_id,
-    bool keep_order,
-    bool is_fast_scan,
+    const DMReadOptions & read_opts,
     size_t expected_block_size,
     const SegmentIdSet & read_segments,
     size_t extra_table_id_index,
@@ -1371,7 +1369,7 @@ void DeltaMergeStore::read(
     auto dm_context = newDMContext(db_context, db_settings, tracing_id, scan_context);
 
     // If keep order is required, disable read thread.
-    auto enable_read_thread = db_context.getSettingsRef().dt_enable_read_thread && !keep_order;
+    auto enable_read_thread = db_context.getSettingsRef().dt_enable_read_thread && !read_opts.keep_order;
     // SegmentReadTaskScheduler and SegmentReadTaskPool use table_id + segment id as unique ID when read thread is enabled.
     // 'try_split_task' can result in several read tasks with the same id that can cause some trouble.
     // Also, too many read tasks of a segment with different small ranges is not good for data sharing cache.
@@ -1390,9 +1388,24 @@ void DeltaMergeStore::read(
     };
 
     GET_METRIC(tiflash_storage_read_tasks_count).Increment(tasks.size());
-    size_t final_num_stream
-        = enable_read_thread ? std::max(1, num_streams) : std::max(1, std::min(num_streams, tasks.size()));
-    auto read_mode = getReadMode(db_context, is_fast_scan, keep_order, filter);
+    size_t final_num_stream = 0;
+    if (enable_read_thread)
+    {
+        // When there are only a few tasks and `enable_read_thread` is on, creating too many source
+        // ops means the table scan cannot keep up with the compute layer and only adds concurrency
+        // overhead. So, for multi-partition reads with the read thread enabled, cap final_num_stream
+        // at tasks.size() * 4.
+        if (read_opts.has_multiple_partitions)
+            final_num_stream = std::min(num_streams, tasks.size() * 4);
+        else
+            final_num_stream = num_streams;
+        final_num_stream = std::max(1, final_num_stream);
+    }
+    else
+    {
+        final_num_stream = std::max(1, std::min(num_streams, tasks.size()));
+    }
+    auto read_mode = getReadMode(db_context, read_opts.is_fast_scan, read_opts.keep_order, filter);
     const auto & final_columns_to_read = filter && filter->extra_cast ? *filter->columns_after_cast : columns_to_read;
     auto read_task_pool = std::make_shared<SegmentReadTaskPool>(
         extra_table_id_index,
@@ -1454,10 +1467,10 @@ void DeltaMergeStore::read(
         "Read create PipelineExec done, keep_order={} dt_enable_read_thread={} enable_read_thread={} "
         "is_fast_scan={} is_push_down_filter_empty={} pool_id={} num_streams={} columns_to_read={} "
         "final_columns_to_read={}",
-        keep_order,
+        read_opts.keep_order,
         db_context.getSettingsRef().dt_enable_read_thread,
         enable_read_thread,
-        is_fast_scan,
+        read_opts.is_fast_scan,
         filter == nullptr || filter->before_where == nullptr,
         read_task_pool->pool_id,
         final_num_stream,
 
@@ -198,6 +198,13 @@ using LocalIndexesStats = std::vector<LocalIndexStats>;
 class DeltaMergeStore;
 using DeltaMergeStorePtr = std::shared_ptr<DeltaMergeStore>;
 
+// TODO: merge more parameters of `read` into DMReadOptions
+struct DMReadOptions // Boolean options passed to DeltaMergeStore::read as one `read_opts` argument.
+{
+    bool keep_order = false; // when true, read() does not enable the read thread
+    bool is_fast_scan = false; // forwarded to getReadMode()
+    bool has_multiple_partitions = false; // with the read thread enabled, caps final_num_stream at tasks.size() * 4
+};
 class DeltaMergeStore
     : private boost::noncopyable
     , public std::enable_shared_from_this<DeltaMergeStore>
@@ -376,7 +383,7 @@ class DeltaMergeStore
 
     /// You must ensure external files are ordered and do not overlap. Otherwise exceptions will be thrown.
     /// You must ensure all of the external files are contained by the range. Otherwise exceptions will be thrown.
-    /// Return the 'ingtested bytes'.
+    /// Return the 'ingested bytes'.
     UInt64 ingestFiles(
         const Context & db_context, //
         const DB::Settings & db_settings,
@@ -468,8 +475,7 @@ class DeltaMergeStore
         const RuntimeFilteList & runtime_filter_list,
         int rf_max_wait_time_ms,
         const String & tracing_id,
-        bool keep_order,
-        bool is_fast_scan = false,
+        const DMReadOptions & read_opts = {},
         size_t expected_block_size = DEFAULT_BLOCK_SIZE,
         const SegmentIdSet & read_segments = {},
         size_t extra_table_id_index = InvalidColumnID,
@@ -493,8 +499,7 @@ class DeltaMergeStore
         const RuntimeFilteList & runtime_filter_list,
         int rf_max_wait_time_ms,
         const String & tracing_id,
-        bool keep_order,
-        bool is_fast_scan = false,
+        const DMReadOptions & read_opts = {},
         size_t expected_block_size = DEFAULT_BLOCK_SIZE,
         const SegmentIdSet & read_segments = {},
         size_t extra_table_id_index = InvalidColumnID,
@@ -633,7 +638,7 @@ class DeltaMergeStore
     void waitForDeleteRange(const DMContextPtr & context, const SegmentPtr & segment);
 
     /// Should be called after every write into DeltaMergeStore.
-    /// If the delta cache reaches the foreground flush limit, it will also trigger a KVStore flush of releated regions,
+    /// If the delta cache reaches the foreground flush limit, it will also trigger a KVStore flush of related regions,
     /// by returning a non-empty DM::WriteResult.
     // Dereferencing `Iter` can get a pointer to a Segment.
     template <typename Iter>
@@ -848,18 +853,18 @@ class DeltaMergeStore
 private:
     /**
       * Remove the segment from the store's memory structure.
-      * Not protected by lock, should accquire lock before calling this function.
+      * Not protected by lock, should acquire lock before calling this function.
       */
     void removeSegment(std::unique_lock<std::shared_mutex> &, const SegmentPtr & segment);
     /**
       * Add the segment to the store's memory structure.
-      * Not protected by lock, should accquire lock before calling this function.
+      * Not protected by lock, should acquire lock before calling this function.
       */
     void addSegment(std::unique_lock<std::shared_mutex> &, const SegmentPtr & segment);
     /**
       * Replace the old segment with the new segment in the store's memory structure.
       * New segment should have the same segment id as the old segment.
-      * Not protected by lock, should accquire lock before calling this function.
+      * Not protected by lock, should acquire lock before calling this function.
       */
     void replaceSegment(
         std::unique_lock<std::shared_mutex> &,
Original file line number	Diff line number	Diff line change
`@@ -921,13 +921,15 @@ std::unordered_map<TableID, SelectQueryInfo> DAGStorageInterpreter::generateSele`
`921`	`921`	`RUNTIME_CHECK_MSG(mvcc_query_info->scan_context != nullptr, "Unexpected null scan_context");`
`922`	`922`	`if (table_scan.isPartitionTableScan())`
`923`	`923`	`{`
	`924`	`+ bool has_multiple_partitions = table_scan.getPhysicalTableIDs().size() > 1;`
`924`	`925`	`for (const auto physical_table_id : table_scan.getPhysicalTableIDs())`
`925`	`926`	`{`
`926`	`927`	`SelectQueryInfo query_info = create_query_info(physical_table_id);`
`927`	`928`	`query_info.mvcc_query_info = std::make_unique<MvccQueryInfo>(`
`928`	`929`	`mvcc_query_info->resolve_locks,`
`929`	`930`	`mvcc_query_info->start_ts,`
`930`	`931`	`mvcc_query_info->scan_context);`
	`932`	`+ query_info.has_multiple_partitions = has_multiple_partitions;`
`931`	`933`	`ret.emplace(physical_table_id, std::move(query_info));`
`932`	`934`	`}`
`933`	`935`	`// Dispatch the regions_query_info to different physical table's query_info`