
Commit 351462e

Support long context dataset accuracy measurement (#230)
1 parent 9d19631 commit 351462e

5 files changed

Lines changed: 229 additions & 17 deletions


Makefile

Lines changed: 1 addition & 1 deletion
@@ -51,4 +51,4 @@ unit-tests:
 	coverage run -m unittest -v
 
 check-test-coverage:
-	coverage report -m --omit="jetstream/core/proto/*,jetstream/engine/tokenizer_pb2.py,jetstream/external_tokenizers/*,benchmarks/benchmark_serving.py,benchmarks/eval_accuracy.py,benchmarks/eval_accuracy_mmlu.py,benchmarks/math_utils.py" --fail-under=96
+	coverage report -m --omit="jetstream/core/proto/*,jetstream/engine/tokenizer_pb2.py,jetstream/external_tokenizers/*,benchmarks/benchmark_serving.py,benchmarks/eval_accuracy.py,benchmarks/eval_accuracy_mmlu.py,benchmarks/eval_accuracy_longcontext.py,benchmarks/math_utils.py" --fail-under=96

benchmarks/benchmark_serving.py

Lines changed: 23 additions & 15 deletions
@@ -71,6 +71,7 @@
 
 from benchmarks.eval_accuracy import eval_accuracy
 from benchmarks.eval_accuracy_mmlu import eval_accuracy_mmlu
+from benchmarks.eval_accuracy_longcontext import eval_accuracy_longcontext
 from benchmarks.metrics import CounterMetric, EventMetric
 import grpc
 from jetstream.core.proto import jetstream_pb2
@@ -166,6 +167,7 @@ class InputRequest:
   output: str = ""
   output_len: int = 0
   sample_idx: int = -1
+  metric: str = ""
 
 
 @dataclass
@@ -187,10 +189,12 @@ def to_dict(self):
       prompt = self.input_request.prompt
       original_output = self.input_request.output
       sample_idx = self.input_request.sample_idx
+      metric = self.input_request.metric
     else:
       prompt = None
      original_output = None
       sample_idx = None
+      metric = None
     return {
         "prompt": prompt,
         "original_output": original_output,
@@ -201,6 +205,7 @@ def to_dict(self):
         "ttst_sec": self.ttst_sec,
         "prompt_len": self.prompt_len,
         "sample_idx": sample_idx,
+        "metric": metric,
     }
 
 
@@ -282,17 +287,19 @@ def load_openorca_dataset_pkl(
 
 def load_longcontext_dataset_pkl(
     dataset_path: str,
-) -> list[tuple[Any, Any]]:
+) -> tuple[list[tuple[Any, Any]], list]:
   assert os.path.isfile(dataset_path)
 
   # read pickle file
   data = pandas.read_pickle(dataset_path)
 
   samples = []
+  metrics = []
   for _, row in data.iterrows():
-    samples.append((row["input"], row["ref_output"]))
+    samples.append((row["input"], row["gt_output"]))
+    metrics.append(row["metric"])
 
-  return samples
+  return samples, metrics
 
 
 def load_mmlu_dataset_csv(dataset_path: str) -> tuple[Any, dict[str, str]]:
@@ -421,7 +428,6 @@ def filter_dataset(
     tokenized_dataset: list[tuple[str, Any, str, int, int, int]],
     dataset_type: str,
     max_output_length: int = 0,
-    run_mmlu_dataset: bool = False,
     min_input_length: int = 4,
     max_input_length: int = 0,
     max_target_length: int = 0,
@@ -443,7 +449,8 @@ def filter_dataset(
       sample_idx,
   ) in tokenized_dataset:
     if prompt_len < min_input_length or (
-        not (run_mmlu_dataset or dataset_type == "math500") and output_len < 4
+        not (dataset_type == "mmlu" or dataset_type == "math500")
+        and output_len < 4
     ):
       # Prune too short sequences.
       # This is because TGI causes errors when the input or output length
@@ -479,11 +486,11 @@ def sample_requests(
     dataset_type: str,
     max_output_length: int = 0,
     oversample_multiplier: float = 1.2,
-    run_mmlu_dataset: bool = False,
     min_input_length: int = 4,
     max_input_length: int = 0,
     max_target_length: int = 0,
     max_output_multiplier: int = 0,
+    metrics: Optional[list[str]] = None,
 ) -> list[InputRequest]:
 
   # Original dataset size
@@ -521,13 +528,16 @@ def sample_requests(
       tokenized_dataset,
       dataset_type,
       max_output_length,
-      run_mmlu_dataset,
       min_input_length,
       max_input_length,
       max_target_length,
       max_output_multiplier,
   )
 
+  if metrics is not None:
+    for request in input_requests:
+      request.metric = metrics[request.sample_idx]
+
   # Sample the requests.
   if len(input_requests) > num_requests:
     input_requests = random.sample(input_requests, num_requests)
@@ -1068,11 +1078,6 @@ def parse_args() -> argparse.Namespace:
       choices=["HELM", "Harness", ""],
       help="mmlu method/format to generate shots",
   )
-  parser.add_argument(
-      "--run-mmlu-dataset",
-      action="store_true",
-      help="specify if it's for mmlu dataset",
-  )
   return parser.parse_args()
 
 
@@ -1094,6 +1099,7 @@ def main(args: argparse.Namespace):
   tokenizer = get_tokenizer(
       model_id, tokenizer_id, use_hf_tokenizer, hf_access_token
   )
+  metrics = None
   if tokenizer == "test" or args.dataset == "test":
     input_requests = mock_requests(
         args.total_mock_requests
@@ -1116,7 +1122,7 @@ def main(args: argparse.Namespace):
         args.dataset_path,
     )
   elif args.dataset == "longcontext":
-    dataset = load_longcontext_dataset_pkl(
+    dataset, metrics = load_longcontext_dataset_pkl(
         args.dataset_path,
     )
   else:
@@ -1134,11 +1140,11 @@ def main(args: argparse.Namespace):
         num_requests=args.num_prompts,
         dataset_type=args.dataset,
         max_output_length=args.max_output_length,
-        run_mmlu_dataset=args.run_mmlu_dataset,
         min_input_length=args.min_input_length,
         max_input_length=args.max_input_length,
         max_target_length=args.max_target_length,
         max_output_multiplier=args.max_output_multiplier,
+        metrics=metrics,
     )
 
   warmup_requests = None
@@ -1184,8 +1190,10 @@ def main(args: argparse.Namespace):
   # Process output
   output = [output.to_dict() for output in request_outputs]
   if args.run_eval:
-    if args.run_mmlu_dataset:
+    if args.dataset == "mmlu":
       eval_json = eval_accuracy_mmlu(output)
+    elif args.dataset == "longcontext":
+      eval_json = eval_accuracy_longcontext(output)
     else:
       eval_json = eval_accuracy(output, args.dataset[:4])
 

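With the loader change above, a long-context pickle is expected to carry three columns — input, gt_output, and metric — and load_longcontext_dataset_pkl now returns the per-sample metric names alongside the (prompt, reference) pairs so that sample_requests can attach them to each InputRequest. A minimal sketch of that round trip; the rows and the /tmp path are illustrative assumptions, not part of this commit:

# Sketch: build a toy dataset in the layout the loader expects, then read it back.
import pandas as pd

from benchmarks.benchmark_serving import load_longcontext_dataset_pkl

rows = [
    {"input": "long prompt ...", "gt_output": "reference summary", "metric": "rouge"},
    {"input": "haystack text ...", "gt_output": "hidden uuids", "metric": "niah_em"},
    {"input": "question ...", "gt_output": "answer a | answer b", "metric": "qa_em"},
]
pd.DataFrame(rows).to_pickle("/tmp/longcontext_toy.pkl")

samples, metrics = load_longcontext_dataset_pkl("/tmp/longcontext_toy.pkl")
# samples -> [(input, gt_output), ...]; metrics -> ["rouge", "niah_em", "qa_em"]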
benchmarks/eval_accuracy_longcontext.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
Evaluate accuracy of JetStream online serving only for long context dataset.
17+
"""
18+
19+
import argparse
20+
import nltk
21+
from tqdm import tqdm
22+
import pandas as pd
23+
import json
24+
import re
25+
from multiprocessing import Pool, cpu_count
26+
import numpy as np
27+
from rouge_score import rouge_scorer
28+
29+
30+
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
31+
32+
33+
def rouge(label, pred):
34+
"""Returns the ROUGE-L score based on the label and the prediction"""
35+
score = scorer.score(label, pred)
36+
return {
37+
"rougeL": 100 * score["rougeL"].fmeasure,
38+
}
39+
40+
41+
def niah_em(label, pred):
42+
"""
43+
Returns the NIAH (needle in a haystack) score based on the label and
44+
the prediction.
45+
Reference: https://github.com/gkamradt/LLMTest_NeedleInAHaystack
46+
"""
47+
label_uuids = re.findall(r"[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}", label)
48+
pred_uuids = re.findall(r"[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}", pred)
49+
50+
if len(pred_uuids) == 0:
51+
return {"exact_match": 0.0}
52+
53+
# https://github.com/hsiehjackson/RULER/blob/main/scripts/eval/synthetic/constants.py#L28
54+
score = (
55+
sum(
56+
[
57+
sum([1.0 if r.lower() in pred.lower() else 0.0 for r in ref])
58+
/ len(ref)
59+
for pred, ref in zip(pred_uuids, label_uuids)
60+
]
61+
)
62+
/ len(pred_uuids)
63+
* 100
64+
)
65+
66+
return {"exact_match": round(score, 2)}
67+
68+
69+
def qa_em(label, pred):
70+
"""
71+
Returns the QA score based on the label and the prediction.
72+
Reference: https://github.com/mlcommons/inference/blob/master/\
73+
language/llama3.1-405b/evaluate-accuracy.py#L69
74+
"""
75+
answer_substring = pred
76+
77+
if "Answer: " in pred:
78+
last_answer_index = pred.rfind("Answer: ")
79+
if last_answer_index == -1:
80+
return {"exact_match": 0.0}
81+
82+
answer_substring = pred[last_answer_index + len("Answer: ") :]
83+
84+
if answer_substring in label:
85+
return {"exact_match": 100.0}
86+
87+
normalized_answer = re.sub(r"\s+", "", answer_substring).lower()
88+
label_entries = [
89+
re.sub(r"\s+", "", entry).lower() for entry in label.split("|")
90+
]
91+
92+
match_found = any(entry in normalized_answer for entry in label_entries)
93+
return {"exact_match": 100.0 if match_found else 0.0}
94+
95+
96+
def postprocess_text(preds, targets):
97+
preds = [pred.strip() for pred in preds]
98+
targets = [target.strip() for target in targets]
99+
100+
# rougeLSum expects newline after each sentence
101+
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
102+
targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets]
103+
104+
return preds, targets
105+
106+
107+
def process_item(item):
108+
pred, target, metric = item
109+
if metric == "rouge":
110+
metric_eval = rouge(target, pred)
111+
elif metric == "niah_em":
112+
metric_eval = niah_em(target, pred)
113+
elif metric == "qa_em":
114+
metric_eval = qa_em(target, pred)
115+
else:
116+
raise ValueError(f"Unknown metric: {metric}")
117+
return metric_eval
118+
119+
120+
def run_evaluation(preds, targets, target_metrics, n_process=None):
121+
n_process = cpu_count() if n_process is None else n_process
122+
with Pool(n_process) as pool:
123+
accuracies = list(
124+
tqdm(
125+
pool.imap(process_item, zip(preds, targets, target_metrics)),
126+
total=len(preds),
127+
)
128+
)
129+
df = pd.DataFrame({"accuracy": accuracies, "metric": target_metrics})
130+
return df.accuracy.apply(pd.Series).describe().loc["mean"].to_dict()
131+
132+
133+
def eval_accuracy_longcontext(request_outputs_dict):
134+
nltk.download("punkt")
135+
preds = []
136+
targets = []
137+
target_metrics = []
138+
for output in request_outputs_dict:
139+
preds.append(output["generated_text"])
140+
targets.append(output["original_output"])
141+
target_metrics.append(output["metric"])
142+
preds, targets = postprocess_text(preds, targets)
143+
result = run_evaluation(preds, targets, target_metrics)
144+
result = dict(result)
145+
prediction_lens = [len(pred) for pred in preds]
146+
result["gen_len"] = int(np.sum(prediction_lens))
147+
result["gen_num"] = len(preds)
148+
print("\nResults\n")
149+
print(result)
150+
return result
151+
152+
153+
def main(args):
154+
with open(args.output_path, "r", encoding="utf-8") as f:
155+
request_outputs_dict = json.load(f)
156+
157+
eval_accuracy_longcontext(request_outputs_dict)
158+
159+
160+
if __name__ == "__main__":
161+
parser = argparse.ArgumentParser()
162+
parser.add_argument(
163+
"--output_path",
164+
type=str,
165+
default="/tmp/request-outputs.json",
166+
help="File path which has original_output and inference generated_text.",
167+
)
168+
169+
parsed_args = parser.parse_args()
170+
171+
main(parsed_args)
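The new module can be exercised on its own, either via its CLI (--output_path, default /tmp/request-outputs.json) or in-process as sketched below; the example outputs are made up, and the only keys each entry needs are generated_text, original_output, and metric. Note that the call downloads nltk's punkt data and spawns a multiprocessing Pool, so it belongs under a __main__ guard:

# Minimal in-process sketch with illustrative request outputs (not from a real run).
from benchmarks.eval_accuracy_longcontext import eval_accuracy_longcontext

if __name__ == "__main__":
  request_outputs = [
      {"generated_text": "Paris.", "original_output": "Paris", "metric": "qa_em"},
      {"generated_text": "A short summary.", "original_output": "A short summary.", "metric": "rouge"},
  ]
  # Returns per-metric means plus gen_len / gen_num counts.
  print(eval_accuracy_longcontext(request_outputs))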

benchmarks/requirements.in

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-nltk
+nltk==3.8.1
 evaluate
 rouge-score
 transformers

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+"""Tests for long context accuracy measurement."""
+
+import unittest
+
+from benchmarks.eval_accuracy_longcontext import eval_accuracy_longcontext
+
+
+class TestEvalAccuracy(unittest.TestCase):
+  """Tests for long context accuracy measurement."""
+
+  def setUp(self):
+    self._request_outputs_dict = [
+        {"generated_text": "abc", "original_output": "abc", "metric": "rouge"},
+        {"generated_text": "abc", "original_output": "abc", "metric": "rouge"},
+        {"generated_text": "abc", "original_output": "abc", "metric": "qa_em"},
+        {"generated_text": "abc", "original_output": "abc", "metric": "qa_em"},
+        {
+            "generated_text": "abc",
+            "original_output": "abc",
+            "metric": "niah_em",
+        },
+        {
+            "generated_text": "abc",
+            "original_output": "abc",
+            "metric": "niah_em",
+        },
+    ]
+
+  def test_eval_accuracy_longcontext(self):
+    self.assertEqual(
+        eval_accuracy_longcontext(self._request_outputs_dict),
+        {"rougeL": 100.0, "exact_match": 50.0, "gen_len": 18, "gen_num": 6},
+    )
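For context on the expected values: run_evaluation expands the per-sample score dicts into a DataFrame, so each row has either a rougeL or an exact_match cell and NaN elsewhere, and describe().loc["mean"] averages each column while skipping NaN. Identical strings score 100 under ROUGE-L and qa_em, while niah_em scores 0 because "abc" contains no UUID, so exact_match averages to 50. A standalone sketch of just that aggregation step, using the same toy scores:

# Sketch of the NaN-skipping column means behind the expected test values.
import pandas as pd

accuracies = [
    {"rougeL": 100.0},
    {"rougeL": 100.0},
    {"exact_match": 100.0},  # qa_em on identical strings
    {"exact_match": 100.0},
    {"exact_match": 0.0},  # niah_em finds no UUIDs in "abc"
    {"exact_match": 0.0},
]
df = pd.DataFrame({"accuracy": accuracies})
print(df.accuracy.apply(pd.Series).describe().loc["mean"].to_dict())
# -> rougeL: 100.0, exact_match: 50.0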
