-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark.py
More file actions
122 lines (108 loc) · 3.91 KB
/
benchmark.py
File metadata and controls
122 lines (108 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import time
import json
import threading
import statistics
# Root URL of the server under test; request paths such as /generate and
# /health are appended to it by the functions below.
BASE_URL = "http://localhost:8000"
def test_request(name, payload):
    """Send one generation request to the server and print a summary line.

    POSTs ``payload`` as JSON to ``{BASE_URL}/generate`` (180 s timeout) and
    prints a single line prefixed with ``[name]`` describing the outcome.

    Args:
        name: Label used as the prefix of every printed line.
        payload: JSON-serialisable dict sent as the request body. Its optional
            ``priority`` key is echoed in the success line; ``'batch'`` is
            shown when the key is absent.

    Returns:
        The decoded JSON response body when the server answers HTTP 200 and
        the body contains ``metrics.tps``; otherwise ``None`` (non-200 status,
        or any error while sending the request or reading the response).
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump (e.g. NTP adjustments), skewing measured latency.
    start = time.perf_counter()
    try:
        r = requests.post(f"{BASE_URL}/generate", json=payload, timeout=180)
        dur = time.perf_counter() - start
        if r.status_code == 200:
            data = r.json()
            tps = data['metrics']['tps']
            print(f"[{name}] Success ({dur:.2f}s) | TPS: {tps:.2f} | Priority: {payload.get('priority', 'batch')}")
            return data
        print(f"[{name}] Fail {r.status_code}: {r.text}")
    except Exception as e:
        # Deliberately broad: this is a best-effort reporting boundary (also
        # used as a thread target), so connection errors, invalid JSON and a
        # missing 'metrics'/'tps' field are all reported instead of raised.
        print(f"[{name}] Error: {e}")
    return None
def run_benchmarks():
    """Run the full benchmark sequence against the server at ``BASE_URL``.

    Steps, in order:
      1. Poll ``/health`` up to 30 times, 2 seconds apart, and abort if the
         server never answers HTTP 200.
      2. Send a raw-text request, a generic JSON request and a
         schema-constrained JSON request, recording the ``generation_time``
         each successful response reports.
      3. Print the relative latency overhead of generic JSON versus raw
         output when both measurements are available.
      4. Send a ``batch`` request and, one second later, an ``interactive``
         request from two concurrent threads.

    All results are printed; the function returns ``None``.
    """
    print("Waiting for server to be healthy...")
    for _ in range(30):
        try:
            # The timeout stops a server that accepts the connection but never
            # answers from stalling the whole polling loop.
            if requests.get(f"{BASE_URL}/health", timeout=5).status_code == 200:
                print("Server is up!")
                break
        except requests.RequestException:
            # Server not reachable yet: keep polling. Catching only request
            # errors lets Ctrl-C (KeyboardInterrupt) still abort the wait.
            pass
        time.sleep(2)
    else:
        # for/else: reached only when no attempt hit the `break` above.
        print("Server failed to start.")
        return

    # Maps a test key to the generation time reported by the server.
    results = {}

    # 1. Raw Output Test
    print("\n--- Testing Raw Output ---")
    raw_res = test_request("Raw Text", {
        "prompt": "Tell me a short joke about computers.",
        "format": "raw",
        "priority": "interactive"
    })
    if raw_res:
        results['raw'] = raw_res['metrics']['generation_time']

    # 2. Structured JSON Test (Generic)
    print("\n--- Testing Structured JSON (Generic) ---")
    json_res = test_request("JSON Generic", {
        "prompt": "Create a JSON object representing a person with name and age.",
        "format": "json",
        "priority": "interactive"
    })
    if json_res:
        results['json_generic'] = json_res['metrics']['generation_time']

    # 3. Structured JSON Test (Schema)
    print("\n--- Testing Structured JSON (Schema) ---")
    schema_res = test_request("JSON Schema", {
        "prompt": "List 3 common programming languages with their creators.",
        "format": "json",
        "json_schema": {
            "type": "object",
            "properties": {
                "languages": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "creator": {"type": "string"}
                        },
                        "required": ["name", "creator"]
                    }
                }
            },
            "required": ["languages"]
        },
        "priority": "batch"
    })
    if schema_res:
        results['json_schema'] = schema_res['metrics']['generation_time']
        print(f"Structured Data: {json.dumps(schema_res['structured_data'], indent=2)}")

    # 4. Latency Overhead Analysis
    if 'raw' in results and 'json_generic' in results:
        raw_time = results['raw']
        if raw_time:
            overhead = (results['json_generic'] - raw_time) / raw_time * 100
            print(f"\nLatency Overhead (JSON vs Raw): {overhead:.2f}%")
        else:
            # A reported raw generation time of 0 would otherwise raise
            # ZeroDivisionError and abort the remaining benchmark steps.
            print("\nLatency Overhead (JSON vs Raw): n/a (raw generation time is 0)")

    # 5. Concurrent Requests Verification
    print("\n--- Testing Concurrent Priority/Preemption ---")

    def run_concurrent():
        """Issue a batch request and a delayed interactive request in parallel."""
        threads = []

        # One long batch job
        t_batch = threading.Thread(target=test_request, args=("Batch Job", {
            "prompt": "Write a 200 word essay on the history of AI.",
            "priority": "batch",
            "max_tokens": 256
        }))
        threads.append(t_batch)

        # One high-priority interactive job after a slight delay
        def high_p():
            # Sleep first so the batch request is sent before this one.
            time.sleep(1)
            test_request("Interactive Job", {
                "prompt": "What is 2+2?",
                "priority": "interactive",
                "max_tokens": 10
            })

        t_inter = threading.Thread(target=high_p)
        threads.append(t_inter)

        # Start both before joining either so the requests overlap.
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    run_concurrent()
# Run the benchmarks only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    run_benchmarks()