Model-Optimizer/examples/windows/accuracy_benchmark/perplexity_metrics/run_perplexity.py at main · kvcache-ai/Model-Optimizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import sys

import pandas as pd

# Ensure this directory is on sys.path for local imports
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
if SCRIPT_DIR not in sys.path:
    sys.path.insert(0, SCRIPT_DIR)
from perplexity_metrics import calculate_perplexity_hf, perplexity_eval  # noqa: E402


def run_perplexity_on_models(
    model_dirs,
    output_file="perplexity_results.csv",
    i="1024",
    chunk_size=None,
    hf_model=None,
    hf_device="cuda",
    hf_dtype=None,
):
    """
    Run perplexity evaluation on multiple ONNX Runtime GenAI models and/or a HuggingFace model.

    This function evaluates one or more models at different input sequence lengths,
    saves results to a CSV file, and prints a summary report. Each model-length
    combination is evaluated independently, with errors handled gracefully.

    Args:
        model_dirs (list[str]): List of model directory paths to evaluate.
                               Each directory must contain a valid ONNX Runtime GenAI model.
        output_file (str, optional): Path for the output CSV file containing results.
                                    Defaults to "perplexity_results.csv".
        i (str or list, optional): Input sequence lengths to evaluate. Can be:
                                  - String: comma-separated values (e.g., "1024,2048,4096")
                                  - List/tuple: sequence of integers
                                  - Single int: one length to evaluate
                                  Defaults to "1024".
        chunk_size (int, optional): Prefill chunk size for KV cache chunking.
                                   Required for input lengths > 1024.
                                   Overrides chunk_size in model config if provided.
                                   Defaults to None.
        hf_model (str, optional): HuggingFace model name or path to evaluate.
                                 If provided, will download and evaluate this model.
                                 Defaults to None.
        hf_device (str, optional): Device to run HuggingFace model on.
                                  Defaults to "cuda".
        hf_dtype (str, optional): Data type for HuggingFace model.
                                 Options: "float16", "bfloat16", "float32".
                                 Defaults to None (uses model default).

    Returns:
        pd.DataFrame: DataFrame containing evaluation results with columns:
                     - Model Path: Full path to model directory
                     - Model Type: "ONNX" or "HuggingFace"
                     - Input Length: Sequence length used for evaluation
                     - Perplexity: Computed perplexity score (or "N/A" if failed)
                     - Status: "Success" or "Failed"
                     - Error: Error message if failed, "None" if successful

    """
    results = []

    # Parse input lengths
    if isinstance(i, str):
        i_list = [int(x.strip()) for x in i.split(",") if x.strip()]
    elif isinstance(i, (list, tuple)):
        i_list = [int(x) for x in i]
    else:
        i_list = [int(i)]

    # Evaluate HuggingFace model if provided
    if hf_model is not None:
        print(f"\n{'=' * 60}")
        print(f"Evaluating HuggingFace model: {hf_model}")
        print(f"{'=' * 60}")

        # Convert dtype string to torch dtype
        import torch

        dtype_map = {
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
            "float32": torch.float32,
            "fp16": torch.float16,
            "bf16": torch.bfloat16,
            "fp32": torch.float32,
        }
        torch_dtype = dtype_map.get(hf_dtype.lower()) if hf_dtype else torch.float16

        for input_len in i_list:
            try:
                print(f"  Evaluating with input length: {input_len}")
                if torch_dtype:
                    print(f"  Using dtype: {torch_dtype}")

                # Calculate stride (use chunk_size if provided, otherwise use half of input_len)
                stride = chunk_size if chunk_size is not None else input_len // 2

                if chunk_size is not None:
                    assert stride == chunk_size, (
                        f"For chunking case, stride must equal chunk_size. "
                        f"Got stride={stride}, chunk_size={chunk_size}"
                    )
                perplexity = calculate_perplexity_hf(
                    model_name_or_path=hf_model,
                    max_length=input_len,
                    stride=stride,
                    device=hf_device,
                    torch_dtype=torch_dtype,
                )

                results.append(
                    {
                        "Model Path": hf_model,
                        "Model Type": "HuggingFace",
                        "Input Length": int(input_len),
                        "Perplexity": float(perplexity),
                        "Status": "Success",
                        "Error": "None",
                    }
                )
            except Exception as e:  # noqa: PERF203
                print(f"  Error for input length {input_len}: {e!s}")
                results.append(
                    {
                        "Model Path": hf_model,
                        "Model Type": "HuggingFace",
                        "Input Length": int(input_len),
                        "Perplexity": "N/A",
                        "Status": "Failed",
                        "Error": str(e),
                    }
                )

        print(" HuggingFace perplexity evaluation completed")

        # Unload HuggingFace model from GPU memory before ONNX evaluation
        print("[CLEANUP] Unloading HuggingFace model from GPU memory...")
        import gc

        import torch

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print("[CLEANUP] GPU memory freed")

    # Evaluate ONNX models
    for model_dir in model_dirs:
        print(f"\n{'=' * 60}")
        print(f"Evaluating perplexity for: {model_dir}")
        print(f"{'=' * 60}")

        try:
            # Check if model directory exists
            if not os.path.exists(model_dir):
                print(f"Error: Model directory does not exist: {model_dir}")
                results.append(
                    {
                        "Model Path": model_dir,
                        "Perplexity": "N/A",
                        "Status": "Directory not found",
                        "Error": "Directory does not exist",
                    }
                )
                continue

            # Check if genai_config.json exists
            config_path = os.path.join(model_dir, "genai_config.json")
            if not os.path.exists(config_path):
                print(f"Error: genai_config.json not found in: {model_dir}")
                results.append(
                    {
                        "Model Path": model_dir,
                        "Model Type": "ONNX",
                        "Perplexity": "N/A",
                        "Status": "Invalid model format",
                        "Error": "genai_config.json not found",
                    }
                )
                continue

            # For each input length, run perplexity_eval and record results
            for input_len in i_list:
                try:
                    print(f"  Evaluating with input length: {input_len}")
                    if chunk_size is None:
                        print(
                            "  Note: input length is ignored unless chunk_size is set or "
                            "config.search.chunk_size is present."
                        )
                    if chunk_size is not None:
                        print(f"  Using chunk_size: {chunk_size}")
                        perplexity = perplexity_eval(model_dir, str(input_len), chunk_size)
                    else:
                        perplexity = perplexity_eval(model_dir, str(input_len))
                    results.append(
                        {
                            "Model Path": model_dir,
                            "Model Type": "ONNX",
                            "Input Length": int(input_len),
                            "Perplexity": float(perplexity),
                            "Status": "Success",
                            "Error": "None",
                        }
                    )
                except Exception as e:  # noqa: PERF203
                    print(f"  Error for input length {input_len}: {e!s}")
                    results.append(
                        {
                            "Model Path": model_dir,
                            "Model Type": "ONNX",
                            "Input Length": int(input_len),
                            "Perplexity": "N/A",
                            "Status": "Failed",
                            "Error": str(e),
                        }
                    )

            print(" Perplexity evaluation completed successfully")

        except Exception as e:
            print(f"Error during perplexity evaluation: {e!s}")
            results.append(
                {
                    "Model Path": model_dir,
                    "Model Type": "ONNX",
                    "Perplexity": "N/A",
                    "Status": "Failed",
                    "Error": str(e),
                }
            )

    # Create results DataFrame and save to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)

    print(f"\n{'=' * 60}")
    print(f"Results saved to: {output_file}")
    print(f"{'=' * 60}")

    # Print summary
    successful = df[df["Status"] == "Success"]
    failed = df[df["Status"] != "Success"]

    print("\nSummary:")
    print(f"  Successful evaluations: {len(successful)}")
    print(f"  Failed evaluations: {len(failed)}")

    if len(successful) > 0:
        print("\nPerplexity Results:")
        for _, row in successful.iterrows():
            print(
                f"  {os.path.basename(row['Model Path'])} [i={row.get('Input Length', '?')}]: "
                f"{row['Perplexity']:.4f}"
                if isinstance(row["Perplexity"], (int, float))
                else row["Perplexity"]
            )

    return df


def main():
    """
    Command-line entry point for perplexity evaluation.

    Parses command-line arguments and runs perplexity evaluation on specified
    ONNX Runtime GenAI models and/or HuggingFace models. Results are saved to a CSV file.

    Command-line Arguments:
        --models: One or more ONNX model directory paths (optional)
        --hf_model: HuggingFace model name or path (optional)
        --hf_device: Device for HuggingFace model (default: "cuda")
        --hf_dtype: Data type for HuggingFace model (default: None)
        --i: Comma-separated input sequence lengths (default: "1024")
        --output: Output CSV file path (default: "perplexity_results.csv")
        --chunk_size: Prefill chunk size for prefill chunking (optional)

    Examples:
        # Evaluate ONNX models
        $ python run_perplexity.py --models /path/to/model
        $ python run_perplexity.py --models /path/to/model1 /path/to/model2 \\
              --i 1024,2048,4096 --chunk_size 1024 --output results.csv

        # Evaluate HuggingFace model
        $ python run_perplexity.py --hf_model meta-llama/Llama-2-7b-hf --i 1024
        $ python run_perplexity.py --hf_model meta-llama/Llama-2-7b-hf \\
              --hf_dtype float16 --hf_device cuda --i 1024,2048

        # Evaluate both ONNX and HuggingFace models
        $ python run_perplexity.py --models /path/to/onnx_model \\
              --hf_model meta-llama/Llama-2-7b-hf --i 1024
    """
    parser = argparse.ArgumentParser(
        description="Run perplexity evaluation on ONNX Runtime GenAI and/or HuggingFace models"
    )
    parser.add_argument(
        "--models",
        nargs="+",
        default=[],
        help="List of ONNX model directory paths to evaluate (optional)",
    )
    parser.add_argument(
        "--i",
        default="1024",
        help="Comma-separated input seq lengths to be evaluated (e.g. 1024,2048) please enter number >= 1024",
    )
    parser.add_argument(
        "--output",
        default="perplexity_results.csv",
        help="Output CSV file name (default: perplexity_results.csv)",
    )
    parser.add_argument(
        "--chunk_size",
        type=int,
        default=None,
        help="Chunk size for KV caching optimization (optional)",
    )
    parser.add_argument(
        "--hf_model",
        type=str,
        default=None,
        help="HuggingFace model name or path to evaluate (e.g., 'meta-llama/Llama-2-7b-hf')",
    )
    parser.add_argument(
        "--hf_device",
        type=str,
        default="cuda",
        help="Device to run HuggingFace model on (default: 'cuda')",
    )
    parser.add_argument(
        "--hf_dtype",
        type=str,
        default=None,
        choices=["float16", "bfloat16", "float32", "fp16", "bf16", "fp32"],
        help="Data type for HuggingFace model (default: None, uses model default)",
    )

    args = parser.parse_args()

    # Validate that at least one model source is provided
    if not args.models and not args.hf_model:
        print("Error: You must provide either --models or --hf_model (or both)")
        parser.print_help()
        return

    # Validate that all model directories exist
    valid_models = []
    for model_dir in args.models:
        if os.path.exists(model_dir):
            valid_models.append(model_dir)
        else:
            print(f"Warning: Model directory does not exist: {model_dir}")

    # Count total models to evaluate
    total_models = len(valid_models) + (1 if args.hf_model else 0)

    print(f"Running perplexity evaluation on {total_models} model(s)...")
    if args.chunk_size is not None:
        print(f"Using chunk_size: {args.chunk_size}")

    run_perplexity_on_models(
        valid_models,
        args.output,
        args.i,
        args.chunk_size,
        args.hf_model,
        args.hf_device,
        args.hf_dtype,
    )


if __name__ == "__main__":
    main()