File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 11import json
22import logging
3+ import math
34import os
45import typing as t
56from dataclasses import dataclass
@@ -97,7 +98,19 @@ def _enumerate_json_files(root_path: str) -> list[str]:
9798 absolute_file_path = os .path .abspath (os .path .join (root_path , item ))
9899 files .append (absolute_file_path )
99100
100- return files
101+ # Hack: lets CAI data generation be split across separate processes to
102+ # speed it up. Set the "SHARD" (zero-based index) and "TOTAL_SHARDS"
103+ # (default 10) environment variables to process one slice of the files.
104+ if "SHARD" not in os .environ :
105+ return files
106+
107+ TOTAL_SHARDS = int (os .environ .get ("TOTAL_SHARDS" , 10 ))
108+ items_per_shard = math .floor (len (files ) / TOTAL_SHARDS )
109+
110+ shard = int (os .environ ["SHARD" ])
111+ file_range = (items_per_shard * shard , (items_per_shard * (shard + 1 )) - 1 )
112+
113+ return files [file_range [0 ]:file_range [1 ]]
101114
102115
103116def _available_json_data () -> t .Generator [dict [str , t .Any ], None , None ]:
You can’t perform that action at this time.
0 commit comments