Skip to content

Commit 0ed0f9b

Browse files
authored
Merge pull request #455 from nnasirinvidia/logging_schema_changes
Added more fields to the schema of mlperf training summary results
2 parents a4e663c + d9250f6 commit 0ed0f9b

1 file changed

Lines changed: 157 additions & 42 deletions

File tree

mlperf_logging/result_summarizer/result_summarizer.py

Lines changed: 157 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,22 @@
77
import os
88
import re
99
import sys
10+
import traceback
1011
import itertools
1112
import pandas as pd
1213
import yaml
14+
import numpy as np
1315
import hashlib
1416
import math
1517
import operator
1618
import uuid as uuidlib
19+
import copy
1720

1821
from ..compliance_checker import mlp_compliance
1922
from ..compliance_checker.mlp_compliance import usage_choices, rule_choices
2023
from ..compliance_checker.mlp_parser import parse_file
2124

25+
from ..rcp_checker import rcp_checker
2226
from ..benchmark_meta import get_allowed_benchmarks, get_result_file_counts
2327

2428

@@ -263,12 +267,23 @@ def _get_weak_scaling_metric_schema():
263267
}
264268

265269

266-
def _get_empty_summary(usage, ruleset, weak_scaling=False):
270+
def _get_strong_scaling_metric_schema():
271+
return {
272+
'time_to_train': float,
273+
'Energy': float,
274+
'GBS': float,
275+
'epochs': float,
276+
'RCP': str,
277+
'rcp_scaling_factor': float,
278+
}
279+
280+
281+
def _get_empty_summary(usage, ruleset, weak_scaling=False, detailed=False):
    """Create a Summary whose columns are the keys of the matching column schema.

    The schema is obtained from `_get_column_schema` with the same
    `usage`, `ruleset`, `weak_scaling` and `detailed` arguments.
    """
    column_schema = _get_column_schema(
        usage, ruleset, weak_scaling=weak_scaling, detailed=detailed)
    column_names = column_schema.keys()
    return Summary(column_names)
269284

270285

271-
def _get_column_schema(usage, ruleset, weak_scaling=False):
286+
def _get_column_schema(usage, ruleset, weak_scaling=False, detailed=False):
272287
schema = {
273288
'division': str,
274289
'availability': str,
@@ -289,9 +304,17 @@ def _get_column_schema(usage, ruleset, weak_scaling=False):
289304
for metric, dtype in _get_weak_scaling_metric_schema().items():
290305
schema['{}:{}'.format(benchmark, metric)] = dtype
291306
else:
292-
schema.update(
293-
{b: float
294-
for b in get_allowed_benchmarks(usage, ruleset)})
307+
if detailed:
308+
benchmarks = get_allowed_benchmarks(usage, ruleset)
309+
for benchmark in benchmarks:
310+
for metric, dtype in _get_strong_scaling_metric_schema().items():
311+
schema['{}:{}'.format(benchmark, metric)] = dtype
312+
else:
313+
schema.update(
314+
{
315+
b: float for b in get_allowed_benchmarks(usage, ruleset)
316+
}
317+
)
295318
schema.update({'details_url': str, 'code_url': str})
296319
return schema
297320

@@ -404,8 +427,8 @@ def _compute_strong_score_standalone(
404427
power_score = olympic_avg
405428
power_score *= scaling_factor
406429
if return_full_scores:
407-
return scores_track, power_scores_track, score, power_score
408-
return score, power_score
430+
return scores_track, power_scores_track, score, power_score, scaling_factor
431+
return score, power_score, scaling_factor
409432

410433

411434
def _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc = {"submitter": None}):
@@ -474,31 +497,106 @@ def _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folde
474497

475498

476499

477-
def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset):
500+
def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division, rcp_bypass=False):
478501
# Collect scores for benchmarks.
479502
benchmark_scores = {}
480-
benchmark_power_scores = {}
481-
has_power = None
503+
detailed_bechmark_scores = {}
482504
benchmark_folder_parent = os.path.join(
483505
system_folder, 'strong') if usage == 'hpc' else system_folder
484506
if not os.path.isdir(benchmark_folder_parent):
485-
return benchmark_scores, benchmark_power_scores
507+
return benchmark_scores, {}
486508
for benchmark_folder in _get_sub_folders(benchmark_folder_parent):
487509
folder_parts = benchmark_folder.split('/')
488510
# Check if this benchmark has power results
489511
has_power = _has_power(benchmark_folder)
490512
benchmark = _benchmark_alias(folder_parts[-1])
491513
system = folder_parts[-3] if usage == 'hpc' else folder_parts[-2]
492-
# Read scores from result files.
493-
score, power_score = _compute_strong_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc)
514+
# Compute base perf/power scores
515+
score, power_score, rcp_scaling_factor = _compute_strong_score_standalone(
516+
benchmark, system, has_power, benchmark_folder, usage, ruleset, desc
517+
)
518+
519+
# RCP/GBS/Epochs additions for closed division
520+
benchmark_gbs = None
521+
benchmark_epochs = None
522+
benchmark_rcp = None
523+
if division == 'closed':
524+
pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder)
525+
result_files = glob.glob(pattern, recursive=True)
526+
try:
527+
# RCP check
528+
verbose = False
529+
bert_train_samples = False
530+
rcp_pass, rcp_msg, _ = rcp_checker.check_directory(
531+
benchmark_folder,
532+
usage,
533+
ruleset,
534+
verbose,
535+
bert_train_samples,
536+
rcp_file=None,
537+
rcp_pass='pruned_rcps',
538+
rcp_bypass=rcp_bypass,
539+
set_scaling=True,
540+
)
541+
if not rcp_pass:
542+
print(
543+
'ERROR: RCP Test Failed on {}/{}/{} with message: {}.'.format(
544+
desc['submitter'], system, benchmark, rcp_msg
545+
)
546+
)
547+
if rcp_msg == 'RCP found':
548+
benchmark_rcp = 'Fail'
549+
elif rcp_msg == 'RCP Interpolation':
550+
benchmark_rcp = 'Interp. Fail'
551+
elif 'Missing' in rcp_msg:
552+
benchmark_rcp = 'Missing'
553+
elif rcp_msg == 'Cannot find any RCPs':
554+
benchmark_rcp = 'No RCP'
555+
else:
556+
benchmark_rcp = 'Unknown state'
557+
else:
558+
benchmark_rcp = 'Pass'
559+
560+
# GBS and epochs
561+
benchmark_gbs, subm_epochs, _ = rcp_checker.get_submission_epochs(
562+
result_files, ruleset, bert_train_samples=False
563+
)
564+
subm_epochs.sort()
565+
samples_rejected = 1
566+
if len(subm_epochs) >= 2 * samples_rejected + 1:
567+
benchmark_epochs = float(
568+
np.mean(
569+
subm_epochs[
570+
samples_rejected : len(subm_epochs) - samples_rejected
571+
]
572+
)
573+
)
574+
except Exception as e:
575+
print(
576+
f"WARNING: RCP/GBS computation failed for {benchmark_folder}: {e}"
577+
)
578+
traceback.print_exc()
579+
580+
# Map into metric-suffixed keys for schema
581+
detailed_bechmark_scores[f"{benchmark}:rcp_scaling_factor"] = float(
582+
rcp_scaling_factor
583+
)
494584
if score is not None:
495-
benchmark_scores[benchmark] = score
585+
detailed_bechmark_scores[f"{benchmark}:time_to_train"] = score
586+
if benchmark_gbs is not None:
587+
detailed_bechmark_scores[f"{benchmark}:GBS"] = float(benchmark_gbs)
588+
if benchmark_epochs is not None:
589+
detailed_bechmark_scores[f"{benchmark}:samples_to_converge"] = float(benchmark_epochs)
590+
if benchmark_rcp is not None:
591+
detailed_bechmark_scores[f"{benchmark}:RCP"] = benchmark_rcp
496592
if power_score is not None:
497-
benchmark_power_scores[benchmark] = power_score
498-
_fill_empty_benchmark_scores(benchmark_scores, usage, ruleset)
499-
if len(benchmark_power_scores) > 0:
500-
_fill_empty_benchmark_scores(benchmark_power_scores, usage, ruleset)
501-
return benchmark_scores, benchmark_power_scores
593+
detailed_bechmark_scores[f"{benchmark}:Energy"] = power_score
594+
benchmark_scores[f"{benchmark}"] = float(
595+
rcp_scaling_factor
596+
)
597+
_fill_empty_benchmark_scores(benchmark_scores, usage, ruleset, detailed=False)
598+
_fill_empty_benchmark_scores(detailed_bechmark_scores, usage, ruleset, detailed=True)
599+
return benchmark_scores, detailed_bechmark_scores
502600

503601

504602
def _compute_weak_scaling_scores(desc, system_folder, usage, ruleset):
@@ -693,6 +791,7 @@ def _fill_empty_benchmark_scores(
693791
usage,
694792
ruleset,
695793
weak_scaling=False,
794+
detailed=False,
696795
):
697796
for benchmark in get_allowed_benchmarks(usage, ruleset):
698797
if weak_scaling:
@@ -702,8 +801,19 @@ def _fill_empty_benchmark_scores(
702801
benchmark_scores[k] = None
703802

704803
else:
705-
if benchmark not in benchmark_scores:
706-
benchmark_scores[benchmark] = None
804+
if detailed:
805+
strong_schema = _get_strong_scaling_metric_schema()
806+
for metric, dtype in strong_schema.items():
807+
k = '{}:{}'.format(benchmark, metric)
808+
if dtype is str:
809+
if k not in benchmark_scores or benchmark_scores[k] is None:
810+
benchmark_scores[k] = ''
811+
else:
812+
if k not in benchmark_scores:
813+
benchmark_scores[k] = None
814+
else:
815+
if benchmark not in benchmark_scores:
816+
benchmark_scores[benchmark] = None
707817

708818

709819
def _get_id_from_sysinfo(summary):
@@ -841,7 +951,7 @@ def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs):
841951
weak_scaling_summary = _get_empty_summary(usage,
842952
ruleset,
843953
weak_scaling=True)
844-
power_summary = _get_empty_summary(usage, ruleset)
954+
detailed_strong_scaling_summary = _get_empty_summary(usage, ruleset, detailed=True)
845955
power_weak_scaling_summary = _get_empty_summary(usage, ruleset, weak_scaling=True)
846956
for system_folder in _get_sub_folders(results_folder):
847957
folder_parts = system_folder.split('/')
@@ -924,8 +1034,8 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None):
9241034
continue
9251035

9261036
# Compute the scores.
927-
strong_scaling_scores, power_scores = _compute_strong_scaling_scores(
928-
desc, system_folder, usage, ruleset)
1037+
strong_scaling_scores, detailed_strong_scaling_scores = _compute_strong_scaling_scores(
1038+
desc, system_folder, usage, ruleset, system_specs["division"], rcp_bypass=False)
9291039
if usage == 'hpc':
9301040
weak_scaling_scores, power_scores_weak_scaling = _compute_weak_scaling_scores(
9311041
desc, system_folder, usage, ruleset)
@@ -950,17 +1060,18 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None):
9501060
urls.items(),
9511061
):
9521062
weak_scaling_summary.push(column_name, value)
953-
if len(power_scores) > 0:
1063+
if len(detailed_strong_scaling_scores) > 0:
9541064
for column_name, value in itertools.chain(
9551065
system_specs.items(),
956-
power_scores.items(),
1066+
detailed_strong_scaling_scores.items(),
9571067
urls.items(),
9581068
):
959-
power_summary.push(column_name, value)
960-
if column_name in strong_scaling_scores:
961-
power_summary.push(column_name, strong_scaling_scores[column_name])
962-
else:
963-
power_summary.push(column_name, value)
1069+
merged = (
1070+
detailed_strong_scaling_scores[column_name]
1071+
if column_name in detailed_strong_scaling_scores
1072+
else value
1073+
)
1074+
detailed_strong_scaling_summary.push(column_name, merged)
9641075
if usage == 'hpc' and len(power_scores_weak_scaling) > 0:
9651076
for column_name, value in itertools.chain(
9661077
system_specs.items(),
@@ -975,13 +1086,13 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None):
9751086
if len(weak_scaling_summary) > 0:
9761087
weak_scaling_summary = weak_scaling_summary.to_dataframe().sort_values(
9771088
_get_sort_by_column_names()).reset_index(drop=True)
978-
if len(power_summary) > 0:
979-
power_summary = power_summary.to_dataframe().sort_values(
1089+
if len(detailed_strong_scaling_summary) > 0:
1090+
detailed_strong_scaling_summary = detailed_strong_scaling_summary.to_dataframe().sort_values(
9801091
_get_sort_by_column_names()).reset_index(drop=True)
9811092
if len(power_weak_scaling_summary) > 0:
9821093
power_weak_scaling_summary = power_weak_scaling_summary.to_dataframe().sort_values(
9831094
_get_sort_by_column_names()).reset_index(drop=True)
984-
return strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary
1095+
return strong_scaling_summary, weak_scaling_summary, detailed_strong_scaling_summary, power_weak_scaling_summary
9851096

9861097

9871098

@@ -1039,23 +1150,23 @@ def main():
10391150

10401151
strong_scaling_summaries = []
10411152
weak_scaling_summaries = []
1042-
power_summaries = []
1153+
detailed_strong_scaling_summaries = []
10431154
power_weak_scaling_summaries = []
10441155

10451156
def _update_summaries(folder):
10461157
if args.usage == "Training":
10471158
config_path = os.path.join(os.path.dirname(__file__), "config.yaml")
10481159
with open(config_path, "r") as f:
10491160
config = yaml.safe_load(f)
1050-
strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary = summarize_results(
1161+
strong_scaling_summary, weak_scaling_summary, detailed_strong_scaling_summary, power_weak_scaling_summary = summarize_results(
10511162
folder,
10521163
args.usage,
10531164
args.ruleset,
10541165
availability = config["availability"],
10551166
generate_private_ids = args.generate_private_ids,
10561167
)
10571168
else:
1058-
strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary = summarize_results(
1169+
strong_scaling_summary, weak_scaling_summary, detailed_strong_scaling_summary, power_weak_scaling_summary = summarize_results(
10591170
folder,
10601171
args.usage,
10611172
args.ruleset,
@@ -1064,8 +1175,8 @@ def _update_summaries(folder):
10641175
strong_scaling_summaries.append(strong_scaling_summary)
10651176
if len(weak_scaling_summary) > 0:
10661177
weak_scaling_summaries.append(weak_scaling_summary)
1067-
if len(power_summary) > 0:
1068-
power_summaries.append(power_summary)
1178+
if len(detailed_strong_scaling_summary) > 0:
1179+
detailed_strong_scaling_summaries.append(detailed_strong_scaling_summary)
10691180
if len(power_weak_scaling_summary) > 0:
10701181
power_weak_scaling_summaries.append(power_weak_scaling_summary)
10711182

@@ -1180,13 +1291,14 @@ def _summaries_to_xlsx(summaries: pd.DataFrame, path, version):
11801291

11811292
writer.save()
11821293
# Print and write back results.
1183-
def _print_and_write(summaries, weak_scaling=False, mode='w', power = False):
1294+
def _print_and_write(summaries, weak_scaling=False, mode='w', power = False, detailed = False):
11841295
if len(summaries) > 0:
11851296
summaries = pd.concat(summaries).astype(
11861297
_get_column_schema(
11871298
args.usage,
11881299
args.ruleset,
11891300
weak_scaling=weak_scaling,
1301+
detailed=detailed
11901302
)
11911303
)
11921304
if weak_scaling:
@@ -1208,6 +1320,9 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False):
12081320
specs_and_notes = [c for c in summaries.columns if c not in benchmarks]
12091321
csv = csv.replace(".csv", "_power.csv")
12101322
summaries.groupby(specs_and_notes).apply(lambda x: agg_columns_fn(x, benchmarks)).to_csv(csv, mode=mode)
1323+
elif detailed:
1324+
csv = csv.replace(".csv", "_detailed.csv")
1325+
summaries.to_csv(csv, index=False, mode=mode)
12111326
else:
12121327
summaries.to_csv(csv, index=False, mode=mode)
12131328
json_path = "summary.json" if args.csv is None else f"""{csv.replace(".csv", ".json")}"""
@@ -1224,7 +1339,7 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False):
12241339
None, 'display.max_colwidth', None):
12251340
_print_and_write(strong_scaling_summaries)
12261341
_print_and_write(weak_scaling_summaries, weak_scaling=True, mode='a')
1227-
_print_and_write(power_summaries, mode='a', power=True)
1342+
_print_and_write(detailed_strong_scaling_summaries, mode='a', detailed=True)
12281343
_print_and_write(power_weak_scaling_summaries, weak_scaling=True, mode='a', power=True)
12291344

12301345

0 commit comments

Comments
 (0)