DataArc-SynData-Toolkit/configs/eval.yaml at main · DataArcTech/DataArc-SynData-Toolkit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# DeepEval Post-Training Evaluation Configuration Example

# Dataset configuration
# `path` is the test dataset; it has 'input' and 'output' fields.
dataset:
  path: './dataset/test.jsonl'

# Output configuration
# NOTE(review): `dir` is presumably where evaluation results are written — confirm.
output:
  dir: './deepeval_results'

# Post-trained model to evaluate
post_trained_model:
  # Path to the fine-tuned model.
  path: './checkpoints/sft/model'
  # NOTE(review): pairwise.base_model is configured with the same device and
  # the same gpu_memory_utilization (0.8); confirm the two models are never
  # resident on this device at the same time.
  device: 'cuda:0'
  # NOTE(review): max_model_len / gpu_memory_utilization look like inference
  # engine options (the names match vLLM's) — confirm how they are consumed.
  max_model_len: 4096
  gpu_memory_utilization: 0.8

# Judge model for G-Eval, accessed through DeepEval's built-in API.
# OPENAI_API_KEY and OPENAI_BASE_URL are read from .env.
judge_model: 'gpt-4.1'

# Inference settings
inference:
  temperature: 0.0
  max_tokens: 2048
  # Model output instruction. The format_compliance metric checks the model
  # output against this format. Written as a folded scalar: the two lines
  # below are joined with a single space into one string.
  output_instruction: >-
    The answer should contain reasoning process step by step.
    Output your final answer after #####.

# ============ Metrics Configuration ============
# All metrics are enabled by default and use sensible default settings.
# Customize below as needed.

# 1. Answer Correctness
#    G-Eval metric that compares the model output with the ground truth.
correctness:
  enabled: true
  # Passing threshold, on a 0-1 scale.
  threshold: 0.7
  # Folded scalar: the lines below are joined with a single space.
  criteria: >-
    Determine whether the actual output is factually correct
    based on the expected output.
  evaluation_steps:
    - 'Compare actual output directly with expected output for factual accuracy'
    - 'Check if all key elements from expected output are present in actual output'
    - 'Assess discrepancies in details, values, or information'
    - 'Penalize factual errors and significant omissions'
  # Rubric bands run from 0 to 10 while `threshold` is on a 0-1 scale.
  # NOTE(review): presumably the judge score is normalised before it is
  # compared with `threshold` — confirm.
  rubric:
    - score_range: [0, 2]
      expected_outcome: 'Factually incorrect or completely wrong answer'
    - score_range: [3, 5]
      expected_outcome: 'Partially correct with significant errors or omissions'
    - score_range: [6, 8]
      expected_outcome: 'Mostly correct with minor errors or missing details'
    - score_range: [9, 10]
      expected_outcome: 'Fully correct and complete answer'

# 2. Format Compliance
#    Checks whether the output follows the format given in output_instruction.
format_compliance:
  enabled: true
  # Passing threshold, on a 0-1 scale.
  threshold: 0.8
  criteria: 'Evaluate whether the actual output follows the output instruction format.'
  evaluation_steps:
    - 'Identify the output format requirements from the output instruction'
    - 'Check if the actual output structure matches the required format'
    - 'Verify all required elements or sections are present'
    - 'Assess proper formatting and syntax compliance'
  # Rubric bands run from 0 to 10 while `threshold` is on a 0-1 scale.
  # NOTE(review): presumably the judge score is normalised before it is
  # compared with `threshold` — confirm.
  rubric:
    - score_range: [0, 2]
      expected_outcome: 'Output does not follow the required format at all'
    - score_range: [3, 5]
      expected_outcome: 'Output partially follows format but missing key elements'
    - score_range: [6, 8]
      expected_outcome: 'Output follows format with minor deviations'
    - score_range: [9, 10]
      expected_outcome: 'Output perfectly follows the required format'

# 3. Pairwise Preference
#    G-Eval comparison of the base model against the post-trained model.
#    Scoring: 10 = post-trained wins, 5 = tie, 0 = base wins.
pairwise:
  enabled: true
  # Required when pairwise is enabled.
  base_model:
    # Base model for comparison.
    path: 'Qwen/Qwen2.5-7B'
    device: 'cuda:0'
    max_model_len: 4096
    gpu_memory_utilization: 0.8
  # Per the criteria text: actual_output holds the post-trained response (A)
  # and expected_output holds the base-model response (B).
  # Folded scalars below: each group of lines is joined with single spaces.
  criteria: >-
    Compare two model responses.
    Response A (actual_output) is from post-trained model.
    Response B (expected_output) is from base model.
    IMPORTANT: Neither response is ground truth - you must independently
    verify correctness.
  evaluation_steps:
    - 'Read the input question carefully and determine the correct answer yourself first.'
    - >-
      Check Response A (actual_output) for factual and logical correctness
      against your determined answer.
    - >-
      Check Response B (expected_output) for factual and logical correctness
      against your determined answer.
    - >-
      Check for hallucinations or unrelated content
      (e.g., irrelevant text, system prompts) - penalize heavily.
    - >-
      If only one response is correct and consistent, that response wins.
      If both are wrong or inconsistent, output 5 (tie).
    - >-
      If both are correct and consistent, prefer the clearer,
      more concise response with less redundancy.
    - 'Output 10 if Response A wins, 0 if Response B wins, or 5 if tie.'