Skip to content

Commit 75e34a9

Browse files
tsafin and claude committed
Phase DS-2: fix numeric dict regression + unordered_map builder lookup
Two targeted fixes to address fact-table regressions found after DS-1 dict8 encoding (store_sales -59%, catalog_sales -74%, web_sales -74%):

1. Disable Parquet auto-dict for int64/int32/float64 columns
   - Add make_writer_props(schema) helper in parquet_writer.cpp
   - Iterates schema fields; calls disable_dictionary() for numeric types
   - Arrow dictionary(int8,utf8) columns unaffected (identified by name)
   - Eliminates ScalarMemoTable<double>::GetOrInsert (was 8.85% of CPU) and ScalarMemoTable<long>::GetOrInsert (was 4.53%) from profiles
   - Used in all 3 WriterProperties construction sites

2. std::map → std::unordered_map for builder lookup
   - Add BuilderMap type alias in dsdgen_converter.hpp
   - Replace all 26 function signatures in dsdgen_converter.cpp
   - Replace create_builders/finish_batch/reset_builders in tpcds_main.cpp
   - O(log N) string comparisons → O(1) average-case hash lookup per column per row

Measured gains (SF=1, Parquet/SNAPPY, avg 2 runs):
   - web_sales: 356K → 424K r/s (+19%)
   - catalog_sales: 359K → 414K r/s (+16%)
   - store_returns: 161K → 167K r/s (+4%)
   - web_returns: 115K → 121K r/s (+5%)
   - customer_demographics: 1354K → 1508K r/s (+11%)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 897a4d7 commit 75e34a9

4 files changed

Lines changed: 83 additions & 66 deletions

File tree

include/tpch/dsdgen_converter.hpp

Lines changed: 28 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,14 @@
11
#pragma once
22

33
#include <memory>
4-
#include <map>
4+
#include <unordered_map>
55
#include <string>
66
#include <arrow/builder.h>
77

88
namespace tpcds {
99

10+
using BuilderMap = std::unordered_map<std::string, std::shared_ptr<arrow::ArrayBuilder>>;
11+
1012
/**
1113
* Convert dsdgen C struct rows to Arrow array builders.
1214
*
@@ -20,177 +22,177 @@ namespace tpcds {
2022
*/
2123
void append_store_sales_to_builders(
2224
const void* row,
23-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
25+
BuilderMap& builders);
2426

2527
/**
2628
* Append an inventory row (W_INVENTORY_TBL*) to Arrow builders.
2729
* Schema matches DSDGenWrapper::get_schema(TableType::INVENTORY).
2830
*/
2931
void append_inventory_to_builders(
3032
const void* row,
31-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
33+
BuilderMap& builders);
3234

3335
/**
3436
* Append a catalog_sales row (W_CATALOG_SALES_TBL*) to Arrow builders.
3537
*/
3638
void append_catalog_sales_to_builders(
3739
const void* row,
38-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
40+
BuilderMap& builders);
3941

4042
/**
4143
* Append a web_sales row (W_WEB_SALES_TBL*) to Arrow builders.
4244
*/
4345
void append_web_sales_to_builders(
4446
const void* row,
45-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
47+
BuilderMap& builders);
4648

4749
/**
4850
* Append a customer row (W_CUSTOMER_TBL*) to Arrow builders.
4951
*/
5052
void append_customer_to_builders(
5153
const void* row,
52-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
54+
BuilderMap& builders);
5355

5456
/**
5557
* Append an item row (W_ITEM_TBL*) to Arrow builders.
5658
*/
5759
void append_item_to_builders(
5860
const void* row,
59-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
61+
BuilderMap& builders);
6062

6163
/**
6264
* Append a date_dim row (W_DATE_TBL*) to Arrow builders.
6365
*/
6466
void append_date_dim_to_builders(
6567
const void* row,
66-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
68+
BuilderMap& builders);
6769

6870
/**
6971
* Append a store_returns row (W_STORE_RETURNS_TBL*) to Arrow builders.
7072
*/
7173
void append_store_returns_to_builders(
7274
const void* row,
73-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
75+
BuilderMap& builders);
7476

7577
/**
7678
* Append a catalog_returns row (W_CATALOG_RETURNS_TBL*) to Arrow builders.
7779
*/
7880
void append_catalog_returns_to_builders(
7981
const void* row,
80-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
82+
BuilderMap& builders);
8183

8284
/**
8385
* Append a web_returns row (W_WEB_RETURNS_TBL*) to Arrow builders.
8486
*/
8587
void append_web_returns_to_builders(
8688
const void* row,
87-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
89+
BuilderMap& builders);
8890

8991
/**
9092
* Append a call_center row (CALL_CENTER_TBL*) to Arrow builders.
9193
*/
9294
void append_call_center_to_builders(
9395
const void* row,
94-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
96+
BuilderMap& builders);
9597

9698
/**
9799
* Append a catalog_page row (CATALOG_PAGE_TBL*) to Arrow builders.
98100
*/
99101
void append_catalog_page_to_builders(
100102
const void* row,
101-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
103+
BuilderMap& builders);
102104

103105
/**
104106
* Append a web_page row (W_WEB_PAGE_TBL*) to Arrow builders.
105107
*/
106108
void append_web_page_to_builders(
107109
const void* row,
108-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
110+
BuilderMap& builders);
109111

110112
/**
111113
* Append a web_site row (W_WEB_SITE_TBL*) to Arrow builders.
112114
*/
113115
void append_web_site_to_builders(
114116
const void* row,
115-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
117+
BuilderMap& builders);
116118

117119
/**
118120
* Append a warehouse row (W_WAREHOUSE_TBL*) to Arrow builders.
119121
*/
120122
void append_warehouse_to_builders(
121123
const void* row,
122-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
124+
BuilderMap& builders);
123125

124126
/**
125127
* Append a ship_mode row (W_SHIP_MODE_TBL*) to Arrow builders.
126128
*/
127129
void append_ship_mode_to_builders(
128130
const void* row,
129-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
131+
BuilderMap& builders);
130132

131133
/**
132134
* Append a household_demographics row (W_HOUSEHOLD_DEMOGRAPHICS_TBL*) to Arrow builders.
133135
*/
134136
void append_household_demographics_to_builders(
135137
const void* row,
136-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
138+
BuilderMap& builders);
137139

138140
/**
139141
* Append a customer_demographics row (W_CUSTOMER_DEMOGRAPHICS_TBL*) to Arrow builders.
140142
*/
141143
void append_customer_demographics_to_builders(
142144
const void* row,
143-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
145+
BuilderMap& builders);
144146

145147
/**
146148
* Append a customer_address row (W_CUSTOMER_ADDRESS_TBL*) to Arrow builders.
147149
*/
148150
void append_customer_address_to_builders(
149151
const void* row,
150-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
152+
BuilderMap& builders);
151153

152154
/**
153155
* Append an income_band row (W_INCOME_BAND_TBL*) to Arrow builders.
154156
*/
155157
void append_income_band_to_builders(
156158
const void* row,
157-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
159+
BuilderMap& builders);
158160

159161
/**
160162
* Append a reason row (W_REASON_TBL*) to Arrow builders.
161163
*/
162164
void append_reason_to_builders(
163165
const void* row,
164-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
166+
BuilderMap& builders);
165167

166168
/**
167169
* Append a time_dim row (W_TIME_TBL*) to Arrow builders.
168170
*/
169171
void append_time_dim_to_builders(
170172
const void* row,
171-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
173+
BuilderMap& builders);
172174

173175
/**
174176
* Append a promotion row (W_PROMOTION_TBL*) to Arrow builders.
175177
*/
176178
void append_promotion_to_builders(
177179
const void* row,
178-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
180+
BuilderMap& builders);
179181

180182
/**
181183
* Append a store row (W_STORE_TBL*) to Arrow builders.
182184
*/
183185
void append_store_to_builders(
184186
const void* row,
185-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
187+
BuilderMap& builders);
186188

187189
/**
188190
* Generic dispatcher by table name.
189191
*/
190192
void append_dsdgen_row_to_builders(
191193
const std::string& table_name,
192194
const void* row,
193-
std::map<std::string, std::shared_ptr<arrow::ArrayBuilder>>& builders);
195+
BuilderMap& builders);
194196

195197
/**
196198
* Returns static dictionary Arrow array for dict8-encoded columns, or nullptr.

0 commit comments

Comments
 (0)