Skip to content

Commit cfa281c

Browse files
tsafin and claude committed
Embed dists.dss at compile time via CMake code generation
Add cmake/gen_dists.py, a Python script that runs at CMake configure time, parses dists.dss, and writes generated/dists_generated.c with:

- 21 static set_member[] arrays with pre-computed cumulative weights (matching read_dist() logic exactly — no runtime parsing)
- load_dists(), which only assigns pointers: no malloc, no fopen, fully idempotent regardless of how many times it is called
- dbgen_reset_seeds() (previously in tpch_init.c)

CMakeLists.txt now runs the generator via execute_process() before add_subdirectory(third_party/dbgen) and registers CMAKE_CONFIGURE_DEPENDS on both dists.dss and gen_dists.py, so CMake re-runs automatically when either file changes.

third_party/dbgen/CMakeLists.txt replaces tpch_init.c with the generated file in the dbgen_objs source list. The dists.dss configure_file() copy is kept for users who invoke raw dbgen tools from the build directory, but the benchmark no longer requires the file to be present at runtime.

Verified: all 8 TPC-H tables at SF=1 (--max-rows 0) produce correct row counts with dists.dss removed from the build directory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent b1cf2d0 commit cfa281c

3 files changed

Lines changed: 269 additions & 1 deletion

File tree

CMakeLists.txt

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,34 @@ endif()
226226
# Include directories
227227
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
228228

229+
# Generate dists_generated.c from dists.dss at configure time.
# This embeds all distribution data as static C arrays, eliminating runtime
# file I/O and heap allocation for distribution loading.
find_package(Python3 REQUIRED COMPONENTS Interpreter)

set(DISTS_DSS_PATH "${CMAKE_CURRENT_SOURCE_DIR}/third_party/tpch/dbgen/dists.dss")
# NOTE: third_party/dbgen/CMakeLists.txt spells this same path out literally,
# so the location must stay under ${CMAKE_BINARY_DIR}/generated.
set(DISTS_GENERATED_C "${CMAKE_BINARY_DIR}/generated/dists_generated.c")
set(_gen_dists_script "${CMAKE_CURRENT_SOURCE_DIR}/cmake/gen_dists.py")

# Arguments are quoted so paths containing spaces or semicolons stay intact.
# Trailing whitespace is stripped so the status/error messages below do not
# carry the generator's final newline.
execute_process(
  COMMAND
    "${Python3_EXECUTABLE}"
    "${_gen_dists_script}"
    "${DISTS_DSS_PATH}"
    "${DISTS_GENERATED_C}"
  RESULT_VARIABLE _gen_dists_result
  OUTPUT_VARIABLE _gen_dists_output
  ERROR_VARIABLE _gen_dists_error
  OUTPUT_STRIP_TRAILING_WHITESPACE
  ERROR_STRIP_TRAILING_WHITESPACE
)
# RESULT_VARIABLE holds the exit code, or a descriptive string when the process
# could not be started; anything other than a numeric 0 is a failure.
if(NOT _gen_dists_result EQUAL 0)
  message(FATAL_ERROR
    "gen_dists.py failed (result: ${_gen_dists_result})\n"
    "stdout: ${_gen_dists_output}\n"
    "stderr: ${_gen_dists_error}")
endif()
message(STATUS "${_gen_dists_output}")

# Re-run cmake if dists.dss or the generator script changes
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
  "${DISTS_DSS_PATH}"
  "${_gen_dists_script}"
)
256+
229257
# Build dbgen objects
230258
add_subdirectory(third_party/dbgen EXCLUDE_FROM_ALL)
231259
include_directories(${DBGEN_INCLUDE_DIRS})

cmake/gen_dists.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
#!/usr/bin/env python3
2+
"""
3+
gen_dists.py - Generate C source from dists.dss at CMake configure time.
4+
5+
Usage: gen_dists.py <input.dss> <output.c>
6+
7+
Reads all BEGIN...END blocks from dists.dss, pre-computes cumulative weights
8+
(exactly as read_dist() in bm_utils.c does), and emits a C source file with:
9+
- static set_member[] arrays (one per distribution)
10+
- load_dists() that just assigns pointers -- no malloc, no file I/O
11+
- dbgen_reset_seeds() (replaces tpch_init.c entirely)
12+
13+
The generated load_dists() is idempotent: pointer assignments are safe to repeat.
14+
"""
15+
16+
import sys
17+
import os
18+
19+
# DIST_MAP translates a dists.dss distribution name (lower-case) into the name
# of the C global variable that receives it.
#
# Only distributions whose C variables are actually defined in the compiled
# code (dbgen_stubs.c) appear here. The remaining ones (nouns, verbs, p_names,
# Q13a, Q13b) are either unused or have no corresponding C definition.

# (dists.dss name, C variable) pairs.
_NAMED_DISTS = (
    ("p_cntr", "p_cntr_set"),
    ("colors", "colors"),
    ("p_types", "p_types_set"),
    ("nations", "nations"),
    ("nations2", "nations2"),
    ("regions", "regions"),
    ("o_oprio", "o_priority_set"),
    ("instruct", "l_instruct_set"),
    ("smode", "l_smode_set"),
    ("category", "l_category_set"),
    ("rflag", "l_rflag_set"),
    ("msegmnt", "c_mseg_set"),
)

# Text-generation distributions (defined in dbgen_stubs.c); the C variable
# carries the same name as the dists.dss block.
_TEXT_DISTS = (
    "adverbs",
    "articles",
    "prepositions",
    "auxillaries",
    "terminators",
    "adjectives",
    "grammar",
    "np",
    "vp",
)

DIST_MAP = dict(_NAMED_DISTS)
DIST_MAP.update((name, name) for name in _TEXT_DISTS)
47+
48+
49+
def escape_c_string(s):
    """Escape a string for safe embedding in a C double-quoted literal.

    Backslash and double quote are backslash-escaped. Newline, carriage
    return and tab use their mnemonic escapes, and any other ASCII control
    character (including DEL) is written as a three-digit octal escape.
    Three digits are always emitted so that a following digit character
    cannot be absorbed into the escape.

    All other characters, non-ASCII included, pass through unchanged.

    Args:
        s: the raw token text.

    Returns:
        The escaped text, without surrounding quotes.
    """
    mnemonic = {
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
    }
    pieces = []
    for ch in s:
        if ch in mnemonic:
            pieces.append(mnemonic[ch])
        elif ord(ch) < 0x20 or ord(ch) == 0x7F:
            pieces.append('\\{:03o}'.format(ord(ch)))
        else:
            pieces.append(ch)
    return ''.join(pieces)
52+
53+
54+
def _strip_comment(raw_line):
    """Return raw_line without its '#' comment and surrounding whitespace."""
    return raw_line.split('#', 1)[0].strip()


def parse_dists(filepath):
    """
    Parse dists.dss. Returns a dict:
        dist_name (lower-case) -> {
            'count':   int,
            'max':     int,   # final cumulative weight (= target->max)
            'entries': [(cum_weight, token_text), ...],
            'var':     str,   # C variable name
        }

    Only blocks whose name is a key of DIST_MAP and which contain a
    "count|N" line are returned; every other block is parsed and dropped.

    Weight accumulation is meant to mirror read_dist() in bm_utils.c:
        target->max += weight;
        target->list[count].weight = target->max;   (cumulative!)

    Raises:
        ValueError: a returned distribution declares "count|N" but holds a
            different number of entries. The generated C array is sized by
            the declared count and initialised from the entries, so a
            mismatch would yield an invalid or partly NULL-filled array.
            (Upstream read_dist() is understood to abort on the same
            condition -- TODO confirm against bm_utils.c.)
    """
    distributions = {}

    with open(filepath, 'r') as f:
        # One shared iterator: the inner loop consumes a block's lines and
        # the outer loop resumes right after that block's END line.
        line_iter = iter(f.readlines())

    for raw in line_iter:
        parts = _strip_comment(raw).split()

        # Only an exact "BEGIN <name>" line opens a block; blank lines and
        # anything else outside a block are ignored.
        if len(parts) != 2 or parts[0].upper() != 'BEGIN':
            continue

        dist_name = parts[1].lower()
        entries = []
        count = None
        cumulative = 0

        for raw_inner in line_iter:
            inner = _strip_comment(raw_inner)
            if not inner:
                continue

            # END terminates the block (prefix match, case-insensitive)
            if inner.upper().startswith('END'):
                break

            # Expect "token|weight"; split on the first '|', as sscanf
            # with %[^|] would.
            token, sep, weight_str = inner.partition('|')
            if not sep:
                continue

            try:
                weight = int(weight_str.strip())
            except ValueError:
                continue

            token = token.strip()
            if token.lower() == 'count':
                count = weight
                continue

            cumulative += weight
            entries.append((cumulative, token))

        if count is None or dist_name not in DIST_MAP:
            continue

        if len(entries) != count:
            raise ValueError(
                "distribution '{}' in {} declares count {} but has {} entries".format(
                    dist_name, filepath, count, len(entries)))

        distributions[dist_name] = {
            'count': count,
            'max': cumulative,
            'entries': entries,
            'var': DIST_MAP[dist_name],
        }

    return distributions
140+
141+
142+
def gen_c_source(distributions, input_path):
    """Build and return the full text of the generated C source file.

    Args:
        distributions: mapping as returned by parse_dists(); each value
            supplies 'var', 'count', 'max' and 'entries'.
        input_path: path of the parsed file; only its base name is used,
            in the banner comment.

    Returns:
        One string holding, in order: the banner and includes, one static
        set_member array per distribution (sorted by name so the output is
        deterministic), load_dists(), and dbgen_reset_seeds().
    """
    header = [
        '/* Auto-generated from {} by cmake/gen_dists.py -- do not edit */'.format(
            os.path.basename(input_path)),
        '',
        '/* dss.h uses EXTERN which becomes "extern" unless DECLARER is set.',
        ' * We do NOT set DECLARER here; distributions are defined in dbgen_stubs.c.',
        ' * We only assign their fields inside load_dists(). */',
        '',
        '#include "dss.h"',
        '#include "dsstypes.h"',
        '',
    ]

    array_defs = []
    assignments = []

    for key in sorted(distributions):
        spec = distributions[key]
        c_var = spec['var']
        n_members = spec['count']
        total = spec['max']
        list_name = 'g_{}_list'.format(c_var)

        array_defs.append('/* -- {} ({} entries, max={}) -- */'.format(
            key, n_members, total))
        array_defs.append('static set_member {}[{}] = {{'.format(list_name, n_members))

        # One initialiser per member; joining with ",\n" puts a comma after
        # every row except the last.
        rows = [
            ' {{ {}L, (char*)"{}" }}'.format(weight, escape_c_string(text))
            for weight, text in spec['entries']
        ]
        if rows:
            array_defs.append(',\n'.join(rows))

        array_defs.append('};')
        array_defs.append('')

        assignments.append(
            ' {var}.count = {count}; {var}.max = {max}L;'.format(
                var=c_var, count=n_members, max=total))
        assignments.append(
            ' {var}.list = {arr}; {var}.permute = (long*)NULL;'.format(
                var=c_var, arr=list_name))

    # load_dists(): nothing but the field assignments collected above.
    loader = ['void load_dists(void)', '{'] + assignments + ['}', '']

    # dbgen_reset_seeds() -- was in tpch_init.c
    reset = [
        '/* dbgen_reset_seeds() -- replaces tpch_init.c */',
        'extern seed_t Seed[];',
        '',
        'void dbgen_reset_seeds(void)',
        '{',
        ' int i;',
        ' for (i = 0; i <= MAX_STREAM; i++) {',
        ' Seed[i].usage = 0;',
        ' }',
        '}',
        '',
    ]

    return '\n'.join(header + array_defs + loader + reset)
209+
210+
211+
def main():
    """Command-line entry point: gen_dists.py <input.dss> <output.c>.

    Parses the input file, renders the C source and writes it to the output
    path, creating the output directory when needed. Prints a one-line
    summary on stdout.

    The output file is only rewritten when its content would change. This
    script runs on every CMake configure; leaving an up-to-date file alone
    keeps its modification time stable, so the build does not recompile the
    generated source after each reconfigure.

    Exits with status 1 on a wrong argument count or a missing input file.
    """
    if len(sys.argv) != 3:
        print('Usage: gen_dists.py <input.dss> <output.c>', file=sys.stderr)
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]

    if not os.path.exists(input_path):
        print('Error: input file not found: {}'.format(input_path), file=sys.stderr)
        sys.exit(1)

    distributions = parse_dists(input_path)
    content = gen_c_source(distributions, input_path)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Read back any previous output; an unreadable file is simply replaced.
    previous = None
    if os.path.isfile(output_path):
        try:
            with open(output_path, 'r') as f:
                previous = f.read()
        except (OSError, UnicodeError):
            previous = None

    if previous != content:
        with open(output_path, 'w') as f:
            f.write(content)

    total_entries = sum(info['count'] for info in distributions.values())
    print('Generated distributions from dists.dss ({} distributions, {} entries)'.format(
        len(distributions), total_entries))


if __name__ == '__main__':
    main()

third_party/dbgen/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ set(DBGEN_CORE_SOURCES
1919
${DBGEN_SOURCE_DIR}/permute.c
2020
${DBGEN_SOURCE_DIR}/speed_seed.c
2121
${DBGEN_SOURCE_DIR}/bcd2.c
22-
${DBGEN_SOURCE_DIR}/tpch_init.c
22+
"${CMAKE_BINARY_DIR}/generated/dists_generated.c"
2323
"${CMAKE_SOURCE_DIR}/src/dbgen/dbgen_stubs.c"
2424
)
2525

0 commit comments

Comments
 (0)