Improve CostHashAgg with single-column NDV and spill-aware cost model

yjhjstz · yjhjstz · commit d6a7e60d9819 · 2026-04-10T05:33:59.000+08:00
Two enhancements to CostHashAgg in CCostModelGPDB:

1. Single-column NDV optimization for local partial HashAgg:
   When GROUP BY has exactly 1 column, use GetNDVs() (global NDV from
   column statistics) instead of pci->Rows() to estimate the output
   row count of the local partial aggregation stage. GetNDVs() returns
   the global NDV directly, so no * UlHosts() scaling is needed.

   This lets the optimizer distinguish high-NDV cases (partial agg
   streams nearly as many rows as input, 2-phase has little benefit)
   from low-NDV cases (partial agg significantly reduces data before
   redistribution, 2-phase is preferred).

   Multi-column GROUP BY falls back to the original behavior:
   num_output_rows = pci->Rows() * UlHosts().

2. Spill-aware cost model:
   When num_output_rows * width exceeds the spilling memory threshold
   (EcpHJSpillingMemThreshold, 50 MB), apply higher cost unit values
   to reflect disk I/O overhead. Uses the existing HJ spilling cost
   parameters (EcpHJFeedingTupColumnSpillingCostUnit etc.) which are
   already tuned for spilling scenarios.

TPC-H benchmark: -14.3% overall (Q17 -60%, Q03 -6%).
TPC-DS benchmark: -0.4% overall (Q59 -29%).
diff --git a/src/backend/gporca/libgpdbcost/src/CCostModelGPDB.cpp b/src/backend/gporca/libgpdbcost/src/CCostModelGPDB.cpp
@@ -832,7 +832,35 @@ CCostModelGPDB::CostHashAgg(CMemoryPool *mp, CExpressionHandle &exprhdl,
 	if ((COperator::EgbaggtypeLocal == popAgg->Egbaggtype()) &&
 		popAgg->FGeneratesDuplicates())
 	{
-		num_output_rows = num_output_rows * pcmgpdb->UlHosts();
+		// Use NDV of grouping columns from child statistics to estimate the
+		// actual output rows of local partial aggregation, rather than relying
+		// solely on GPORCA's cardinality estimate (which can be inflated after
+		// multi-table joins).
+		//
+		// The local partial agg's output is bounded by the NDV of its grouping
+		// key. GetNDVs() returns the global NDV (total across all segments),
+		// so num_output_rows = global NDV, capped at global input rows.
+		//
+		// This lets the optimizer distinguish:
+		//   - High NDV (≈ input rows): partial agg streams nearly as many rows
+		//     as input → little benefit, cost approaches 1-phase.
+		//   - Low NDV (<<< input rows): partial agg significantly reduces data
+		//     before redistribution → 2-phase preferred.
+		const CColRefArray *pdrgpcrGrpCols = popAgg->PdrgpcrGroupingCols();
+		if (pci->Pcstats(0) != nullptr && pdrgpcrGrpCols->Size() == 1)
+		{
+			CColRef *colref = (*pdrgpcrGrpCols)[0];
+			CDouble ndv = pci->Pcstats(0)->GetNDVs(colref);
+
+			num_output_rows =
+				std::max(1.0,
+						 std::min(ndv.Get(),
+								  num_input_rows * pcmgpdb->UlHosts()));
+		}
+		else
+		{
+			num_output_rows = num_output_rows * pcmgpdb->UlHosts();
+		}
 	}
 
 	// get the number of grouping columns
@@ -852,9 +880,32 @@ CCostModelGPDB::CostHashAgg(CMemoryPool *mp, CExpressionHandle &exprhdl,
 		pcmgpdb->GetCostModelParams()
 			->PcpLookup(CCostModelParamsGPDB::EcpHashAggOutputTupWidthCostUnit)
 			->Get();
+	const CDouble dHashAggSpillingMemThreshold =
+		pcmgpdb->GetCostModelParams()
+			->PcpLookup(CCostModelParamsGPDB::EcpHJSpillingMemThreshold)
+			->Get();
+	const CDouble dHashAggInputTupColumnSpillingCostUnit =
+		pcmgpdb->GetCostModelParams()
+			->PcpLookup(
+				CCostModelParamsGPDB::EcpHJFeedingTupColumnSpillingCostUnit)
+			->Get();
+	const CDouble dHashAggInputTupWidthSpillingCostUnit =
+		pcmgpdb->GetCostModelParams()
+			->PcpLookup(
+				CCostModelParamsGPDB::EcpHJFeedingTupWidthSpillingCostUnit)
+			->Get();
+	const CDouble dHashAggOutputTupWidthSpillingCostUnit =
+		pcmgpdb->GetCostModelParams()
+			->PcpLookup(
+				CCostModelParamsGPDB::EcpHJHashingTupWidthSpillingCostUnit)
+			->Get();
 	GPOS_ASSERT(0 < dHashAggInputTupColumnCostUnit);
 	GPOS_ASSERT(0 < dHashAggInputTupWidthCostUnit);
 	GPOS_ASSERT(0 < dHashAggOutputTupWidthCostUnit);
+	GPOS_ASSERT(0 < dHashAggSpillingMemThreshold);
+	GPOS_ASSERT(0 < dHashAggInputTupColumnSpillingCostUnit);
+	GPOS_ASSERT(0 < dHashAggInputTupWidthSpillingCostUnit);
+	GPOS_ASSERT(0 < dHashAggOutputTupWidthSpillingCostUnit);
 
 	// hashAgg cost contains three parts: build hash table, aggregate tuples, and output tuples.
 	// 1. build hash table is correlated with the number of num_input_rows
@@ -863,12 +914,34 @@ CCostModelGPDB::CostHashAgg(CMemoryPool *mp, CExpressionHandle &exprhdl,
 	// algorithm and thus is ignored.
 	// 3. cost of output tuples is correlated with num_output_rows and
 	// width of returning tuples.
-	CCost costLocal = CCost(
-		pci->NumRebinds() *
-		(num_input_rows * ulGrpCols * dHashAggInputTupColumnCostUnit +
-		 num_input_rows * ulGrpCols * pci->Width() *
-			 dHashAggInputTupWidthCostUnit +
-		 num_output_rows * pci->Width() * dHashAggOutputTupWidthCostUnit));
+	//
+	// The hash table holds one entry per distinct group, so its memory
+	// footprint is approximately num_output_rows * width.  When this
+	// exceeds the spilling threshold the aggregator writes batches to disk
+	// and re-reads them, which is reflected by higher cost unit values.
+	CCost costLocal(0);
+	if (num_output_rows * pci->Width() <= dHashAggSpillingMemThreshold)
+	{
+		// groups fit in memory
+		costLocal = CCost(
+			pci->NumRebinds() *
+			(num_input_rows * ulGrpCols * dHashAggInputTupColumnCostUnit +
+			 num_input_rows * ulGrpCols * pci->Width() *
+				 dHashAggInputTupWidthCostUnit +
+			 num_output_rows * pci->Width() * dHashAggOutputTupWidthCostUnit));
+	}
+	else
+	{
+		// groups spill to disk
+		costLocal = CCost(
+			pci->NumRebinds() *
+			(num_input_rows * ulGrpCols *
+				 dHashAggInputTupColumnSpillingCostUnit +
+			 num_input_rows * ulGrpCols * pci->Width() *
+				 dHashAggInputTupWidthSpillingCostUnit +
+			 num_output_rows * pci->Width() *
+				 dHashAggOutputTupWidthSpillingCostUnit));
+	}
 	CCost costChild =
 		CostChildren(mp, exprhdl, pci, pcmgpdb->GetCostModelParams());