11import io
2+ import os
23from dataclasses import dataclass
34from typing import Literal , Optional
45
@@ -50,6 +51,7 @@ class JsonConfig(datasets.BuilderConfig):
5051 chunksize : int = 10 << 20 # 10MB
5152 newlines_in_values : Optional [bool ] = None
5253 on_mixed_types : Optional [Literal ["use_json" ]] = "use_json"
54+ parse_agent_traces : bool = True
5355
5456 def __post_init__ (self ):
5557 super ().__post_init__ ()
@@ -83,13 +85,19 @@ def _split_generators(self, dl_manager):
8385 splits .append (
8486 datasets .SplitGenerator (
8587 name = split_name ,
86- gen_kwargs = {"files_iterables" : files_iterables , "base_files" : base_data_files [split_name ]},
88+ gen_kwargs = {
89+ "files_iterables" : files_iterables ,
90+ "base_files" : base_data_files [split_name ],
91+ "original_files" : self .config .data_files [split_name ],
92+ },
8793 )
8894 )
8995 if self .info .features is None :
9096 try :
9197 pa_table = next (iter (self ._generate_tables (** splits [0 ].gen_kwargs , allow_full_read = False )))[1 ]
9298 self .info .features = datasets .Features .from_arrow_schema (pa_table .schema )
99+ if self .config .parse_agent_traces and has_agent_traces_markers (self .info .features ):
100+ self .info .features = AGENT_TRACES_FEATURES
93101 except FullReadDisallowed :
94102 pass
95103 return splits
@@ -124,14 +132,18 @@ def _cast_table(self, pa_table: pa.Table, json_field_paths=()) -> pa.Table:
124132 pa_table = table_cast (pa_table , features .arrow_schema )
125133 return pa_table
126134
127- def _generate_shards (self , base_files , files_iterables ):
135+ def _generate_shards (self , base_files , files_iterables , original_files ):
128136 yield from base_files
129137
130- def _generate_tables (self , base_files , files_iterables , allow_full_read = True ):
138+ def _generate_tables (self , base_files , files_iterables , original_files , allow_full_read = True ):
131139 json_field_paths = []
140+ is_agent_traces = False
132141
133142 if self .info .features is not None :
134- json_field_paths = get_json_field_paths_from_feature (self .info .features )
143+ if self .info .features == AGENT_TRACES_FEATURES :
144+ is_agent_traces = True
145+ else :
146+ json_field_paths = get_json_field_paths_from_feature (self .info .features )
135147
136148 for shard_idx , files_iterable in enumerate (files_iterables ):
137149 for file in files_iterable :
@@ -149,6 +161,24 @@ def _generate_tables(self, base_files, files_iterables, allow_full_read=True):
149161 pa_table = pa .Table .from_pandas (df , preserve_index = False )
150162 yield Key (shard_idx , 0 ), self ._cast_table (pa_table )
151163
164+ # If the files are agent traces (one row = one file)
165+ elif is_agent_traces :
166+ with open (file , "r" , encoding = "utf-8" ) as f :
167+ traces = f .readlines ()
168+ harness , session_id = parse_traces_info (traces )
169+ file_path = original_files [shard_idx ]
170+ if file_path .startswith (self .base_path ):
171+ file_path = os .path .relpath (file_path , self .base_path )
172+ pa_table = pa .Table .from_pydict (
173+ {
174+ "harness" : [harness ],
175+ "session_id" : [session_id ],
176+ "traces" : [traces ],
177+ "file_path" : [file_path ],
178+ }
179+ )
180+ yield Key (shard_idx , 0 ), self ._cast_table (pa_table )
181+
152182 # If the file has one json object per line
153183 else :
154184 with open (file , "rb" ) as f :
@@ -265,3 +295,89 @@ def _generate_tables(self, base_files, files_iterables, allow_full_read=True):
265295 self ._cast_table (pa_table , json_field_paths = json_field_paths ),
266296 )
267297 batch_idx += 1
298+
299+
# Values of the top-level "type" field that each agent harness writes in its trace lines.
AGENT_TRACES_TYPES_VALUES = {
    "claude_code": ["user", "assistant", "system"],
    "pi": ["session", "message"],
    "codex": ["session_meta", "turn_context", "response_item", "event_msg"],
}

# Reverse lookup: trace "type" value -> harness name.
# Built in declaration order, so a later harness would win if two harnesses ever shared a type value.
AGENT_TRACES_TYPE_TO_HARNESS = {
    trace_type: harness_name
    for harness_name, trace_types in AGENT_TRACES_TYPES_VALUES.items()
    for trace_type in trace_types
}
309+
310+
# Per-harness marker columns: a string "type" column plus one JSON column.
# `has_agent_traces_markers` compares inferred features against these.
AGENT_TRACES_FEATURES_MARKERS = {
    harness_name: datasets.Features(
        {
            "type": datasets.Value("string"),
            json_column: datasets.Json(),
        }
    )
    for harness_name, json_column in (
        ("claude_code", "message"),
        ("pi", "message"),
        ("codex", "payload"),
    )
}

# Schema used once a dataset is recognised as agent traces:
# the harness name, the session id, the list of JSON trace lines, and the source file path.
_AGENT_TRACES_COLUMNS = {
    "harness": datasets.Value("string"),
    "session_id": datasets.Value("string"),
    "traces": datasets.List(datasets.Json()),
    "file_path": datasets.Value("string"),
}
AGENT_TRACES_FEATURES = datasets.Features(_AGENT_TRACES_COLUMNS)
340+
341+
def has_agent_traces_markers(features: datasets.Features) -> bool:
    """Tell whether `features` contains the marker columns of any known agent harness.

    Args:
        features: The features to inspect.

    Returns:
        `True` if, for at least one entry of `AGENT_TRACES_FEATURES_MARKERS`, every marker
        column is present in `features` with an equal feature type; `False` otherwise.
        Extra columns in `features` are ignored.
    """
    return any(
        all(features.get(column) == expected for column, expected in marker.items())
        for marker in AGENT_TRACES_FEATURES_MARKERS.values()
    )
347+
348+
def parse_traces_info(traces: list[str]) -> tuple[Optional[str], Optional[str]]:
    """Infer the agent harness and the session id from the lines of a trace file.

    Lines are decoded one by one with `ujson_loads` and scanning stops as soon as both
    values are known. Blank lines and lines that do not decode to a JSON object are skipped.

    Args:
        traces: Raw lines of the trace file, each expected to hold one JSON document.

    Returns:
        A `(harness, session_id)` tuple; either element is `None` if it could not be determined.

    Raises:
        Whatever `ujson_loads` raises on a non-blank line that is not valid JSON.
    """
    harness, session_id = None, None
    for trace in traces:
        # Lines from `readlines()` may be blank (e.g. trailing newlines): nothing to decode.
        if not trace.strip():
            continue
        decoded_trace = ujson_loads(trace)
        # Only JSON objects carry the fields inspected below; skipping other values avoids
        # a TypeError from `in` / indexing on numbers or strings.
        if not isinstance(decoded_trace, dict):
            continue

        trace_type = decoded_trace.get("type")
        if harness is None and isinstance(trace_type, str):
            harness = AGENT_TRACES_TYPE_TO_HARNESS.get(trace_type)

        if session_id is None:
            payload = decoded_trace.get("payload")
            # claude
            if isinstance(decoded_trace.get("sessionId"), str):
                session_id = decoded_trace["sessionId"]
            # claude (not sure but this format does exist online)
            elif isinstance(decoded_trace.get("session_id"), str):
                session_id = decoded_trace["session_id"]
            # codex
            elif isinstance(payload, dict) and isinstance(payload.get("id"), str):
                session_id = payload["id"]
            # pi / openclaw (openclaw embeds pi-agent; distinguish via cwd)
            elif trace_type == "session" and isinstance(decoded_trace.get("id"), str):
                session_id = decoded_trace["id"]
                cwd = decoded_trace.get("cwd")
                if isinstance(cwd, str) and "/.openclaw/" in cwd:
                    # Overrides the "pi" harness deduced from the "session" type above.
                    harness = "openclaw"

        # `is not None` rather than truthiness: an empty-string session id is final too
        # (it is never overwritten), so there is no point in scanning the rest of the file.
        if harness is not None and session_id is not None:
            break
    return harness, session_id
0 commit comments