@@ -234,7 +234,11 @@ def merge_partial_indexes(output_dir: Path, quiet_mode: bool = False) -> None:
234234 workers have finished and written their partial indexes.
235235
236236 Partial indexes use JSONL format (one JSON object per line) for efficient
237- append-only writes during fill. Entries are validated with Pydantic here.
237+ append-only writes during fill.
238+
239+ Memory-optimized: Builds hash trie directly while streaming entries,
240+ avoiding accumulation of all entries in a single list. Writes final
241+ JSON by re-reading partials (2x I/O but ~50% less peak memory).
238242
239243 Args:
240244 output_dir: The fixture output directory.
@@ -247,58 +251,137 @@ def merge_partial_indexes(output_dir: Path, quiet_mode: bool = False) -> None:
247251 if not partial_files :
248252 raise Exception ("No partial indexes found." )
249253
250- # Merge all partial indexes (JSONL format: one entry per line)
251- # Read as raw dicts — the data was already validated when collected
252- # from live Pydantic fixture objects in add_fixture().
253- all_raw_entries : list [dict ] = []
254+ # Pass 1: Build hash trie directly while streaming (no intermediate list)
255+ # Only keep what's needed for hash computation: path parts and fixture_hash
256+ root_trie : dict = {}
254257 all_forks : set = set ()
255258 all_formats : set = set ()
259+ test_count = 0
256260
257261 for partial_file in partial_files :
258262 with open (partial_file ) as f :
259263 for line in f :
260264 line = line .strip ()
261265 if not line :
262266 continue
263- entry_data = json .loads (line )
264- all_raw_entries .append (entry_data )
265- # Collect forks and formats from raw strings
266- if entry_data .get ("fork" ):
267- all_forks .add (entry_data ["fork" ])
268- if entry_data .get ("format" ):
269- all_formats .add (entry_data ["format" ])
270-
271- # Compute root hash from raw dicts (no Pydantic needed)
272- root_hash = HashableItem .from_raw_entries (all_raw_entries ).hash ()
273-
274- # Build final index — Pydantic validates the entire structure once
275- # via model_validate(), not 96k individual model_validate() calls.
276- index = IndexFile .model_validate (
277- {
278- "test_cases" : all_raw_entries ,
279- "root_hash" : HexNumber (root_hash ),
280- "created_at" : datetime .datetime .now (),
281- "test_count" : len (all_raw_entries ),
282- "forks" : list (all_forks ),
283- "fixture_formats" : list (all_formats ),
284- }
285- )
267+ entry = json .loads (line )
268+ test_count += 1
269+
270+ # Collect metadata
271+ if entry .get ("fork" ):
272+ all_forks .add (entry ["fork" ])
273+ if entry .get ("format" ):
274+ all_formats .add (entry ["format" ])
275+
276+ # Insert directly into trie for hash computation
277+ fixture_hash = entry .get ("fixture_hash" )
278+ if not fixture_hash :
279+ continue
280+
281+ path_parts = Path (entry ["json_path" ]).parts
282+ current = root_trie
283+
284+ # Navigate to parent folder, creating nodes as needed
285+ for part in path_parts [:- 1 ]:
286+ if part not in current :
287+ current [part ] = {}
288+ current = current [part ]
289+
290+ # Add test entry to file node
291+ file_name = path_parts [- 1 ]
292+ if file_name not in current :
293+ current [file_name ] = []
286294
287- # Write final index
295+ hash_bytes = int (fixture_hash , 16 ).to_bytes (32 , "big" )
296+ current [file_name ].append ((entry ["id" ], hash_bytes ))
297+
298+ # Compute root hash from trie (reusing hasher's trie_to_hashable logic)
299+ root_hash = _trie_to_hash (root_trie )
300+
301+ # Free trie memory before pass 2
302+ del root_trie
303+
304+ # Pass 2: Stream entries to final JSON file (re-read partials)
305+ # This avoids keeping all entries in memory simultaneously
288306 index_path = meta_dir / "index.json"
289307 index_path .parent .mkdir (parents = True , exist_ok = True )
290- index_path .write_text (index .model_dump_json (exclude_none = True , indent = 2 ))
308+
309+ with open (index_path , "w" ) as out_f :
310+ # Write header
311+ out_f .write ("{\n " )
312+ out_f .write (f' "root_hash": "0x{ root_hash .hex ()} ",\n ' )
313+ out_f .write (
314+ f' "created_at": "{ datetime .datetime .now ().isoformat ()} ",\n '
315+ )
316+ out_f .write (f' "test_count": { test_count } ,\n ' )
317+ out_f .write (f' "forks": { json .dumps (sorted (all_forks ))} ,\n ' )
318+ out_f .write (
319+ f' "fixture_formats": { json .dumps (sorted (all_formats ))} ,\n '
320+ )
321+ out_f .write (' "test_cases": [\n ' )
322+
323+ # Stream test cases from partials (second read)
324+ first_entry = True
325+ for partial_file in partial_files :
326+ with open (partial_file ) as f :
327+ for line in f :
328+ line = line .strip ()
329+ if not line :
330+ continue
331+ if not first_entry :
332+ out_f .write (",\n " )
333+ first_entry = False
334+ # Write entry with indentation
335+ entry = json .loads (line )
336+ entry_json = json .dumps (entry , indent = 2 )
337+ # Indent each line of the entry
338+ indented = "\n " .join (
339+ " " + ln for ln in entry_json .split ("\n " )
340+ )
341+ out_f .write (indented )
342+
343+ out_f .write ("\n ]\n " )
344+ out_f .write ("}" )
291345
292346 if not quiet_mode :
293347 rich .print (
294348 f"[green]Merged { len (partial_files )} partial indexes "
295- f"({ len ( all_raw_entries ) } test cases) into { index_path } [/]"
349+ f"({ test_count } test cases) into { index_path } [/]"
296350 )
297351
298352 # Cleanup partial files
299353 for partial_file in partial_files :
300354 partial_file .unlink ()
301355
302356
def _trie_to_hash(root_trie: dict) -> bytes:
    """
    Compute the root hash of a path trie built while streaming entries.

    Each key of a trie node is a path component. A ``dict`` value is a
    folder node; a ``list`` value is a file node holding
    ``(test_id, hash_bytes)`` tuples.

    Hashing rules, applied bottom-up:

    - File node: SHA-256 over its test hashes, concatenated in ascending
      ``test_id`` order.
    - Folder node (including the root): SHA-256 over the digests of its
      children, concatenated in ascending child-name order.

    NOTE(review): this is meant to mirror ``HashableItem.from_raw_entries``;
    that code is not visible from here, so confirm the two stay in sync.

    Args:
        root_trie: Root folder node of the trie.

    Returns:
        The 32-byte SHA-256 digest of the root node. An empty trie yields
        the SHA-256 digest of the empty byte string.

    """
    import hashlib
    from operator import itemgetter

    first_field = itemgetter(0)

    def hash_node(node: dict) -> bytes:
        """Return the digest of a folder node, recursing into subfolders."""
        folder_hasher = hashlib.sha256()

        # Sort by name only so the digest does not depend on the order in
        # which entries were inserted into the trie.
        for _name, child in sorted(node.items(), key=first_field):
            if isinstance(child, list):
                # File node: feed the test hashes in test-id order. Updating
                # incrementally is equivalent to hashing their concatenation.
                file_hasher = hashlib.sha256()
                for _test_id, test_hash in sorted(child, key=first_field):
                    file_hasher.update(test_hash)
                folder_hasher.update(file_hasher.digest())
            else:
                # Folder node: its digest is computed recursively.
                folder_hasher.update(hash_node(child))

        return folder_hasher.digest()

    return hash_node(root_trie)
384+
385+
# Run the CLI entry point only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    generate_fixtures_index_cli()
0 commit comments