Skip to content

Commit 411f60b

Browse files
Merge pull request #90 from evaleval/fix_different_uuids_for_json_and_jsonl
Fix different uuids for json and jsonl files for the same log in the CLI
2 parents 332304e + eda456c commit 411f60b

8 files changed

Lines changed: 380 additions & 49 deletions

File tree

every_eval_ever/cli.py

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import argparse
66
import json
7+
import os
78
import sys
89
import uuid
910
from pathlib import Path
@@ -108,23 +109,37 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int:
108109

109110

110111
def _cmd_convert_inspect(args: argparse.Namespace) -> int:
    """Convert Inspect AI log(s) and write each result under a fixed UUID.

    ``args.log_path`` may point at a single log file or at a directory.
    One UUID per expected log is generated *before* conversion and passed
    to the adapter through ``metadata`` (``file_uuid`` for a single file,
    ``file_uuids`` for a directory). The same UUID is then handed to
    ``_write_log`` as ``eval_uuid`` so both sides use one identifier.

    Args:
        args: Parsed CLI arguments; ``log_path`` and ``output_dir`` are
            read here, the rest is consumed by ``_common_metadata``.

    Returns:
        ``0`` once every converted log has been written.

    Raises:
        FileNotFoundError: ``args.log_path`` is neither a file nor a
            directory.
        RuntimeError: The adapter returned a different number of logs
            than UUIDs were generated for.
    """
    from every_eval_ever.converters.inspect.adapter import (
        InspectAIAdapter,
        list_eval_logs,
    )

    adapter = InspectAIAdapter()
    metadata = _common_metadata(args)

    log_path = Path(args.log_path)
    eval_uuids: list[str]
    if log_path.is_file():
        eval_uuids = [str(uuid.uuid4())]
        metadata['file_uuid'] = eval_uuids[0]
        logs = [adapter.transform_from_file(log_path, metadata)]
    elif log_path.is_dir():
        # Count the logs up front so one UUID exists per log before the
        # adapter runs.
        eval_paths = list_eval_logs(log_path.absolute().as_posix())
        eval_uuids = [str(uuid.uuid4()) for _ in eval_paths]
        metadata['file_uuids'] = eval_uuids
        logs = adapter.transform_from_directory(log_path, metadata)
    else:
        raise FileNotFoundError(f'Path is not a file or directory: {log_path}')

    # zip() below would silently drop entries on a length mismatch, so
    # fail loudly instead of writing logs under the wrong UUID.
    if len(logs) != len(eval_uuids):
        raise RuntimeError(
            'Inspect conversion produced a different number of logs than '
            'the generated UUID list.'
        )

    output_dir = Path(args.output_dir)
    for log, eval_uuid in zip(logs, eval_uuids):
        print(_write_log(log, output_dir, eval_uuid=eval_uuid))

    print(f'Converted {len(logs)} evaluation log(s).')
    return 0
@@ -135,16 +150,39 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int:
135150

136151
adapter = HELMAdapter()
137152
metadata = _common_metadata(args)
138-
metadata['file_uuid'] = str(uuid.uuid4())
153+
log_path = Path(args.log_path)
154+
155+
eval_uuids: list[str]
156+
if adapter._directory_contains_required_files(log_path):
157+
eval_uuids = [str(uuid.uuid4())]
158+
metadata['file_uuid'] = eval_uuids[0]
159+
elif log_path.is_dir():
160+
run_dirs = [
161+
entry.path
162+
for entry in os.scandir(log_path)
163+
if entry.is_dir()
164+
and adapter._directory_contains_required_files(entry.path)
165+
]
166+
eval_uuids = [str(uuid.uuid4()) for _ in run_dirs]
167+
metadata['file_uuids'] = eval_uuids
168+
else:
169+
raise FileNotFoundError(f'Path is not a file or directory: {log_path}')
139170

140171
logs = adapter.transform_from_directory(
141-
Path(args.log_path),
172+
log_path,
142173
output_path=str(Path(args.output_dir) / 'helm_output'),
143174
metadata_args=metadata,
144175
)
176+
177+
if len(logs) != len(eval_uuids):
178+
raise RuntimeError(
179+
'HELM conversion produced a different number of logs than '
180+
'the generated UUID list.'
181+
)
182+
145183
output_dir = Path(args.output_dir)
146-
for log in logs:
147-
print(_write_log(log, output_dir))
184+
for log, eval_uuid in zip(logs, eval_uuids):
185+
print(_write_log(log, output_dir, eval_uuid=eval_uuid))
148186

149187
print(f'Converted {len(logs)} evaluation log(s).')
150188
return 0

every_eval_ever/converters/common/utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
11
import hashlib
2+
import re
23
from datetime import datetime
3-
from typing import Dict
4+
from pathlib import Path
5+
from typing import Any, Dict
46

57
from huggingface_hub import HfApi
68

9+
_UUID_FILE_RE = re.compile(
10+
r'(?P<uuid>[0-9a-f]{8}-[0-9a-f]{4}-[1-8][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12})(?:_samples)?(?:\.jsonl?)?$',
11+
re.IGNORECASE,
12+
)
13+
714

815
def convert_timestamp_to_unix_format(timestamp: str) -> str:
916
dt = datetime.fromisoformat(timestamp)
@@ -78,3 +85,20 @@ def sha256_file(path, chunk_size=8192):
7885

7986
def sha256_string(text: str, chunk_size=8192):
    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8.

    ``chunk_size`` is accepted but not used: the whole string is hashed
    in a single call.
    """
    return hashlib.sha256(text.encode('utf-8')).hexdigest()
88+
89+
90+
def extract_file_uuid_from_detailed_results(log: Any) -> str | None:
    """Return the UUID embedded in a log's detailed-results file name.

    Reads ``log.detailed_evaluation_results.file_path``; both attributes
    are fetched with ``getattr`` so objects lacking them are tolerated.
    Only the final path component is searched, using ``_UUID_FILE_RE``.

    Args:
        log: Any object that may carry a ``detailed_evaluation_results``
            attribute with a ``file_path``.

    Returns:
        The matched UUID text as it appears in the file name, or ``None``
        when the attribute chain is missing/empty or no UUID is found.
    """
    detailed = getattr(log, 'detailed_evaluation_results', None)
    if not detailed:
        return None

    file_path = getattr(detailed, 'file_path', None)
    if not file_path:
        return None

    # str() lets both plain strings and path-like values through Path().
    filename = Path(str(file_path)).name
    uuid_match = _UUID_FILE_RE.search(filename)
    if uuid_match:
        return uuid_match.group('uuid')

    return None

every_eval_ever/converters/helm/adapter.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import datetime
22
import json
33
import os
4+
import uuid
45
from pathlib import Path
56
from typing import Any, Dict, List, Tuple
67

@@ -68,7 +69,8 @@ def _require_helm_dependencies() -> None:
6869
if _HELM_IMPORT_ERROR is not None:
6970
raise ImportError(
7071
'HELM converter dependencies are missing. '
71-
"Install with: pip install 'every_eval_ever[helm]'"
72+
"Install with: uv sync --extra helm "
73+
"(or pip install 'every_eval_ever[helm]')."
7274
) from _HELM_IMPORT_ERROR
7375

7476

@@ -171,18 +173,46 @@ def transform_from_directory(
171173
# all_instance_logs: List[InstanceLevelEvaluationLog] = []
172174
aggregate_logs: List[EvaluationLog] = []
173175

176+
file_uuids = metadata_args.get('file_uuids')
177+
174178
if self._directory_contains_required_files(dir_path):
175179
data = self._load_evaluation_run_logfiles(dir_path)
176-
agg = self._transform_single(data, metadata_args)
180+
per_log_metadata_args = dict(metadata_args)
181+
if (
182+
isinstance(file_uuids, list)
183+
and file_uuids
184+
and file_uuids[0]
185+
):
186+
per_log_metadata_args['file_uuid'] = file_uuids[0]
187+
else:
188+
per_log_metadata_args['file_uuid'] = metadata_args.get(
189+
'file_uuid'
190+
) or str(uuid.uuid4())
191+
agg = self._transform_single(data, per_log_metadata_args)
177192
aggregate_logs.append(agg)
178193
else:
194+
converted_idx = 0
179195
for entry in os.scandir(dir_path):
180196
if entry.is_dir() and self._directory_contains_required_files(
181197
entry.path
182198
):
183199
data = self._load_evaluation_run_logfiles(entry.path)
184-
agg = self._transform_single(data, metadata_args)
200+
per_log_metadata_args = dict(metadata_args)
201+
if (
202+
isinstance(file_uuids, list)
203+
and converted_idx < len(file_uuids)
204+
and file_uuids[converted_idx]
205+
):
206+
per_log_metadata_args['file_uuid'] = file_uuids[
207+
converted_idx
208+
]
209+
else:
210+
per_log_metadata_args['file_uuid'] = str(
211+
uuid.uuid4()
212+
)
213+
agg = self._transform_single(data, per_log_metadata_args)
185214
aggregate_logs.append(agg)
215+
converted_idx += 1
186216

187217
# # Write all consolidated instance logs to JSONL
188218
# with open(output_path, 'w', encoding='utf-8') as f:

every_eval_ever/converters/inspect/__main__.py

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
from inspect_ai.log import list_eval_logs
1313

1414
from every_eval_ever.converters.inspect.adapter import InspectAIAdapter
15-
from every_eval_ever.converters.inspect.supplemental_eval_details import SupplementalEvalDetails
15+
from every_eval_ever.converters.inspect.supplemental_eval_details import (
16+
SupplementalEvalDetails,
17+
)
1618
except ImportError as exc:
1719
raise SystemExit(
1820
"The 'inspect-ai' package is required to use the Inspect AI converter.\n"
@@ -165,16 +167,6 @@ def save_evaluation_log(
165167
return False
166168

167169

168-
def extract_file_uuid_from_output(unified_output: EvaluationLog) -> str | None:
169-
detailed = unified_output.detailed_evaluation_results
170-
if detailed and detailed.file_path:
171-
stem = Path(detailed.file_path).stem
172-
suffix = '_samples'
173-
if stem.endswith(suffix):
174-
return stem[: -len(suffix)]
175-
return None
176-
177-
178170
if __name__ == '__main__':
179171
logging.basicConfig(level=logging.INFO)
180172
args = parse_args()
@@ -216,21 +208,14 @@ def extract_file_uuid_from_output(unified_output: EvaluationLog) -> str | None:
216208
metadata_args
217209
)
218210
if unified_output and isinstance(unified_output, List):
219-
for idx, single_unified_output in enumerate(unified_output):
220-
file_uuid = (
221-
file_uuids[idx] if idx < len(file_uuids) else None
211+
if len(unified_output) != len(file_uuids):
212+
raise RuntimeError(
213+
'Inspect conversion produced a different number of '
214+
'logs than the generated UUID list.'
222215
)
223-
if not file_uuid:
224-
file_uuid = extract_file_uuid_from_output(
225-
single_unified_output
226-
)
227-
if not file_uuid:
228-
file_uuid = str(uuid.uuid4())
229-
logger.warning(
230-
'Missing UUID for output %s; generated %s for aggregate save.',
231-
single_unified_output.evaluation_id,
232-
file_uuid,
233-
)
216+
for single_unified_output, file_uuid in zip(
217+
unified_output, file_uuids
218+
):
234219
save_evaluation_log(
235220
single_unified_output,
236221
inspect_converter,

every_eval_ever/converters/inspect/utils.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,10 @@
11
import json
22
import re
33
from pathlib import Path
4+
from typing import Any, Dict, List, Type
45

56
from pydantic import BaseModel
6-
from typing import Any, Dict, List, Type
77

8-
from every_eval_ever.eval_types import (
9-
AgenticEvalConfig,
10-
EvaluationResult,
11-
GenerationArgs,
12-
GenerationConfig,
13-
InferenceEngine,
14-
MetricConfig,
15-
ModelInfo
16-
)
178
from every_eval_ever.converters.common.utils import get_model_organization_info
189
from every_eval_ever.converters.inspect.supplemental_eval_details import (
1910
SupplementalAgenticEvalConfig,
@@ -22,6 +13,15 @@
2213
SupplementalGenerationConfig,
2314
SupplementalSourceData,
2415
)
16+
from every_eval_ever.eval_types import (
17+
AgenticEvalConfig,
18+
EvaluationResult,
19+
GenerationArgs,
20+
GenerationConfig,
21+
InferenceEngine,
22+
MetricConfig,
23+
ModelInfo,
24+
)
2525

2626

2727
class ModelPathHandler:
@@ -549,4 +549,4 @@ def apply_supplemental_eval_details(
549549
supplement = unnamed_supplements[unnamed_idx]
550550
unnamed_idx += 1
551551

552-
apply_result_supplement(evaluation_result, supplement)
552+
apply_result_supplement(evaluation_result, supplement)

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ dependencies = [
2222

2323
[project.optional-dependencies]
2424
inspect = ["inspect-ai>=0.3.160,<0.4.0"]
25-
helm = ["crfm-helm>=0.5.12"]
25+
helm = [
26+
"crfm-helm>=0.5.12",
27+
"typer>=0.12,<1.0",
28+
]
2629
all = [
2730
"every-eval-ever[inspect]",
2831
"every-eval-ever[helm]",

0 commit comments

Comments
 (0)