Skip to content

Commit f02f577

Browse files
Martin LindströmMartin Lindström
authored and committed
Arm backend: Add evaluate_model.py
This patch reimplements the evaluation feature that used to be in aot_arm_compiler.py while introducing a few improvements. The program is evaluate_model.py and it imports functions from aot_arm_compiler.py to compile a model in a similar manner, but runs its own code that is focused on evaluating a model using the evaluators classes in backends/arm/util/arm_model_evaluator.py. The following is supported in evaluate_model.py: - TOSA reference models (INT, FP). - Evaluating a model that is quantized and/or lowered. I.e., it is possible to evaluate a model that is quantized but not lowered, lowered but not quantized, or both at the same time. - The program can cast the model with the --dtype flag to evaluate a model in e.g., bf16 or fp16 format. Signed-off-by: Martin Lindström <Martin.Lindstroem@arm.com> Change-Id: I85f731633364da1eb71abe602a0335f531ec7e46
1 parent 76df414 commit f02f577

2 files changed

Lines changed: 395 additions & 1 deletion

File tree

Lines changed: 393 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,393 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
# Copyright 2023-2026 Arm Limited and/or its affiliates.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
import argparse
9+
import copy
10+
import json
11+
import logging
12+
import os
13+
import sys
14+
15+
from pathlib import Path
16+
17+
# Add Executorch root to path so this script can be run from anywhere.
# This file sits three directory levels below the repository root,
# hence parents[3].
_EXECUTORCH_DIR = Path(__file__).resolve().parents[3]
_EXECUTORCH_DIR_STR = str(_EXECUTORCH_DIR)
if _EXECUTORCH_DIR_STR not in sys.path:
    # Prepend so this checkout takes precedence on the import path.
    sys.path.insert(0, _EXECUTORCH_DIR_STR)
22+
23+
from typing import Any
24+
25+
import torch
26+
27+
from backends.arm.util.arm_model_evaluator import (
28+
Evaluator,
29+
FileCompressionEvaluator,
30+
ImageNetEvaluator,
31+
NumericalModelEvaluator,
32+
)
33+
from examples.arm.aot_arm_compiler import (
34+
CALIBRATION_MAX_SAMPLES,
35+
dump_delegation_info,
36+
get_model_and_inputs_from_name,
37+
load_calibration_samples,
38+
quantize_model,
39+
QuantMode,
40+
)
41+
42+
from examples.models import MODEL_NAME_TO_MODEL
43+
from executorch.backends.arm.tosa import TosaSpecification
44+
from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
45+
from executorch.backends.arm.util._factory import create_partitioner
46+
47+
from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
48+
from torch.utils.data import DataLoader
49+
50+
51+
# Evaluator names accepted by the --evaluators flag.
_EVALUATORS = [
    "numerical",
    "imagenet",
]

# Quantization modes accepted by the --quant_mode flag.
_QUANT_MODES = [
    "int8",
    "a16w8",
]

# Maps the --dtype CLI choice to the corresponding torch dtype.
_DTYPE_MAP = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}
67+
68+
def _get_args():
    """Parse and validate command-line arguments for model evaluation.

    Returns:
        argparse.Namespace: Parsed arguments, with ``evaluators`` normalized
        to a validated list of evaluator names and ``output`` defaulted to a
        metrics file inside the intermediates directory when not given.

    Raises:
        ValueError: If the argument combination is invalid — neither
            quantization nor delegation requested, calibration data without
            quantization, --dtype combined with --quant_mode, unknown or
            empty --evaluators, a missing ImageNet dataset, or a malformed
            --target string.
    """
    parser = argparse.ArgumentParser(
        "Evaluate a model quantized and/or delegated for the Arm backend."
        " Evaluations include numerical comparison to the original model"
        # Leading space below is required: adjacent literals are concatenated.
        " and/or top-1/top-5 accuracy if applicable."
    )
    parser.add_argument(
        "-m",
        "--model_name",
        required=True,
        help="Model file .py/.pth/.pt or a model from examples/models."
        f" Available models from examples/models: {', '.join(MODEL_NAME_TO_MODEL.keys())}",
    )
    parser.add_argument(
        "-t",
        "--target",
        action="store",
        required=True,
        help=(
            "For Arm backend delegated models, pick the target."
            " Examples of valid targets: TOSA-1.0+INT, TOSA-1.0+FP+bf16"
        ),
    )
    parser.add_argument(
        "-q",
        "--quant_mode",
        required=False,
        default=None,
        choices=_QUANT_MODES,
        help="Quantize the model using the requested mode.",
    )
    parser.add_argument(
        "--calibration_data",
        required=False,
        default=None,
        help=(
            "Optional calibration data file or directory. If a directory is "
            "provided, up to 1000 samples are used for calibration. "
            "Supported files: Common image formats (e.g., .png or .jpg) if "
            "using imagenet evaluator, otherwise .pt/.pth files. If not provided, "
            "quantized models are calibrated on their example inputs."
        ),
    )
    parser.add_argument(
        "--no_delegate",
        # store_false on dest="delegate": passing the flag disables delegation.
        action="store_false",
        dest="delegate",
        default=True,
        help=(
            "Disable delegation for cases where a quantized but non-delegated "
            "model is to be tested."
        ),
    )
    parser.add_argument(
        "-e",
        "--evaluators",
        required=True,
        help=(
            "Comma-separated list of evaluators to use. " f"Valid values: {_EVALUATORS}"
        ),
    )
    parser.add_argument(
        "--evaluation_dataset",
        required=False,
        default=None,
        help="Provide path to evaluation dataset directory. (only applicable for ImageNet evaluation).",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        required=False,
        default=1,
        help="Batch size to use for ImageNet evaluation. (only applicable for ImageNet evaluation).",
    )
    parser.add_argument(
        "-s",
        "--so_library",
        required=False,
        default=None,
        help="Path to .so library to load custom ops from before evaluation.",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Set the logging level to debug."
    )
    parser.add_argument(
        "--dtype",
        choices=sorted(_DTYPE_MAP.keys()),
        default=None,
        help="Cast the model to evaluate and its inputs to the given dtype.",
    )
    parser.add_argument(
        "-i",
        "--intermediates",
        action="store",
        required=True,
        help="Store intermediate output (like TOSA artifacts) at the specified directory.",
    )
    parser.add_argument(
        "-o",
        "--output",
        required=False,
        default=None,
        help="Path to JSON file where evaluation metrics will be stored.",
    )
    args = parser.parse_args()

    LOGGING_FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
    logging_level = logging.DEBUG if args.debug else logging.WARNING
    logging.basicConfig(level=logging_level, format=LOGGING_FORMAT, force=True)

    # Cross-flag validation: the evaluated model must differ from the
    # reference in at least one way (quantized and/or delegated).
    if args.quant_mode is None and not args.delegate:
        raise ValueError(
            "The model to test must be either quantized or delegated (--quant_mode or --delegate)."
        )

    if args.calibration_data is not None and args.quant_mode is None:
        raise ValueError("--calibration_data requires --quant_mode to be enabled.")

    if args.quant_mode is not None and args.dtype is not None:
        raise ValueError("Cannot specify --dtype when --quant_mode is enabled.")

    # Normalize the comma-separated --evaluators string into a list of names.
    evaluators: list[str] = [
        entry.strip() for entry in args.evaluators.split(",") if entry.strip()
    ]
    if not evaluators:
        raise ValueError("At least one evaluator must be specified in --evaluators.")
    unknown = [entry for entry in evaluators if entry not in _EVALUATORS]
    if unknown:
        raise ValueError(
            "Unknown evaluators in --evaluators: " f"{', '.join(sorted(set(unknown)))}"
        )
    args.evaluators = evaluators

    if "imagenet" in args.evaluators and args.evaluation_dataset is None:
        raise ValueError("Evaluation dataset must be provided for ImageNet evaluation.")

    # Default output path to intermediates folder with name based on target and extensions
    if args.output is None:
        args.output = os.path.join(args.intermediates, f"{args.target}_metrics.json")

    # Validate the target string early so a bad --target fails before any
    # expensive model work.
    try:
        TosaSpecification.create_from_string(args.target)
    except ValueError as e:
        raise ValueError(f"Invalid target format for --target: {e}") from e

    return args
214+
215+
216+
def _get_compile_spec(args) -> TosaCompileSpec:
    """Build a TOSA compile spec from the parsed CLI arguments.

    Configures the spec to dump intermediate artifacts (e.g. TOSA files)
    into the directory given by ``args.intermediates`` when one is set.
    """
    spec = TosaCompileSpec(TosaSpecification.create_from_string(args.target))

    if args.intermediates is not None:
        spec.dump_intermediate_artifacts_to(args.intermediates)

    return spec
224+
225+
226+
def _build_imagenet_calibration_samples(
    calibration_dir: str, max_samples: int
) -> list[tuple[torch.Tensor, ...]]:
    """Collect up to ``max_samples`` single-image calibration tuples.

    Images are read from an ImageNet-style folder via the evaluator's loader
    and wrapped as one-element tuples so they match the model's call
    convention; labels are discarded.
    """
    dataset = ImageNetEvaluator.load_imagenet_folder(calibration_dir)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    samples: list[tuple[torch.Tensor, ...]] = []
    for image, _label in loader:
        if len(samples) >= max_samples:
            break
        samples.append((image,))
    return samples
237+
238+
239+
def _evaluate(
    args, model_name, ref_model, eval_model, example_inputs
) -> dict[str, Any]:
    """Instantiate the requested evaluators, run them, and merge their metrics.

    Always attempts a TOSA-file compression-ratio evaluation first (warning if
    no .tosa artifact is present in the intermediates directory), then adds
    one evaluator per name in ``args.evaluators``.
    """
    # None when --dtype was not given (None is not a key of _DTYPE_MAP).
    eval_dtype = _DTYPE_MAP.get(args.dtype, None)

    pending: list[Evaluator] = []

    # Add evaluator for compression ratio of TOSA file
    tosa_files = list(Path(args.intermediates).glob("*.tosa"))
    if not tosa_files:
        logging.warning(
            f"No TOSA file found in {args.intermediates} for compression evaluation"
        )
    else:
        pending.append(FileCompressionEvaluator(model_name, str(tosa_files[0])))

    # Add user-specified evaluators
    for evaluator_name in args.evaluators:
        evaluator: Evaluator
        if evaluator_name == "numerical":
            evaluator = NumericalModelEvaluator(
                model_name,
                ref_model,
                eval_model,
                example_inputs,
                eval_dtype=eval_dtype,
            )
        elif evaluator_name == "imagenet":
            evaluator = ImageNetEvaluator(
                model_name,
                eval_model,
                batch_size=args.batch_size,
                validation_dataset_path=args.evaluation_dataset,
                eval_dtype=eval_dtype,
            )
        else:
            # Unreachable: _get_args validates evaluator names.
            raise AssertionError(f"Unknown evaluator {evaluator_name}")
        pending.append(evaluator)

    # Run evaluators and merge all metric dicts into one.
    metrics: dict[str, Any] = {}
    for evaluator in pending:
        metrics |= evaluator.evaluate()

    return metrics
285+
286+
287+
def main() -> None:
    """Compile a model for the Arm backend and evaluate it.

    Pipeline: parse args -> load model -> (optional dtype cast) -> export ->
    (optional quantize) -> (optional delegate to Arm backend) -> run
    evaluators -> write metrics as JSON to ``args.output``.
    """
    try:
        args = _get_args()
    except ValueError as e:
        logging.error(f"Argument error: {e}")
        sys.exit(1)

    # if we have custom ops, register them before processing the model
    if args.so_library is not None:
        logging.info(f"Loading custom ops from {args.so_library}")
        torch.ops.load_library(args.so_library)

    # Get the model and its example inputs
    original_model, example_inputs = get_model_and_inputs_from_name(
        args.model_name, None
    )

    # Use original model as reference to compare against
    ref_model = original_model.eval()
    eval_model = ref_model
    eval_inputs = example_inputs

    # Cast model and inputs to eval_dtype if specified
    if args.dtype is not None:
        eval_dtype = _DTYPE_MAP[args.dtype]
        # Deep-copy so the fp32 reference model is left untouched by the cast.
        eval_model = copy.deepcopy(original_model).to(eval_dtype).eval()
        eval_inputs = tuple(
            inp.to(eval_dtype) if isinstance(inp, torch.Tensor) else inp
            for inp in example_inputs
        )

    # Export the model
    exported_program = torch.export.export(eval_model, eval_inputs)

    # Model name without directory or extension, used for artifact file names.
    model_name = os.path.basename(os.path.splitext(args.model_name)[0])
    if args.intermediates:
        os.makedirs(args.intermediates, exist_ok=True)

        # We only support Python3.10 and above, so use a later pickle protocol
        torch.export.save(
            exported_program,
            f"{args.intermediates}/{model_name}_exported_program.pt2",
            pickle_protocol=5,
        )

    compile_spec = _get_compile_spec(args)

    # Quantize the model if requested
    if args.quant_mode is not None:
        calibration_samples = None
        # ImageNet evaluation with an image directory: calibrate on real
        # images; otherwise fall back to generic sample loading (which uses
        # example_inputs when no calibration data was given).
        if (
            "imagenet" in args.evaluators
            and args.calibration_data is not None
            and Path(args.calibration_data).is_dir()
        ):
            calibration_samples = _build_imagenet_calibration_samples(
                args.calibration_data, CALIBRATION_MAX_SAMPLES
            )
        else:
            calibration_samples = load_calibration_samples(
                args.calibration_data, example_inputs
            )

        match args.quant_mode:
            case "a16w8":
                quant_mode = QuantMode.A16W8
            case "int8":
                quant_mode = QuantMode.INT8
            case _:
                # Unreachable: argparse restricts --quant_mode to _QUANT_MODES.
                raise AssertionError(f"Unknown quantization mode: {args.quant_mode}")

        # Replaces both the eval model and the exported program with their
        # quantized counterparts.
        eval_model, exported_program = quantize_model(
            exported_program.module(),
            eval_inputs,
            compile_spec,
            model_name,
            True,
            quant_mode,
            calibration_samples,
        )

    # Delegate the model to Arm backend if requested
    if args.delegate:
        partitioner = create_partitioner(compile_spec)
        edge = to_edge_transform_and_lower(
            exported_program,
            partitioner=[partitioner],
            compile_config=EdgeCompileConfig(
                _check_ir_validity=False,
            ),
        )
        exported_program = edge.exported_program()
        eval_model = exported_program.module()

        # Must stay inside this branch: `edge` is only bound when delegating.
        dump_delegation_info(edge, args.intermediates)

    # Evaluate the model
    metrics = _evaluate(args, model_name, ref_model, eval_model, example_inputs)

    # Dump result as JSON
    output = {"name": model_name, "target": args.target, "metrics": metrics}
    with open(args.output, "w") as f:
        json.dump(output, f, indent=4)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)