# Multi-Query-Batch-Inference-Optimization/benchmark.py at main · dakshjain-1616/Multi-Query-Batch-Inference-Optimization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import time
import json
import threading
import statistics

# Root URL of the inference server under test; the "/health" and "/generate"
# endpoint paths used in this file are appended to it.
BASE_URL = "http://localhost:8000"

def test_request(name, payload):
    """POST *payload* to the server's ``/generate`` endpoint and report the outcome.

    Exactly one line is printed per call: a success line (client-side
    duration, the ``metrics.tps`` value from the response, and the request
    priority), an HTTP failure line, or an error line.

    Args:
        name: Label used as the ``[name]`` prefix of the printed line.
        payload: JSON-serialisable dict sent as the request body. Its
            optional ``priority`` key is echoed in the success line
            (printed as ``batch`` when the key is absent).

    Returns:
        The decoded JSON response body when the server answers HTTP 200 and
        the body contains ``metrics.tps``; ``None`` on any other status code
        or on any exception (connection error, timeout, invalid JSON,
        missing keys).
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    try:
        r = requests.post(f"{BASE_URL}/generate", json=payload, timeout=180)
        dur = time.perf_counter() - started
        if r.status_code != 200:
            print(f"[{name}] Fail {r.status_code}: {r.text}")
            return None
        data = r.json()
        tps = data['metrics']['tps']
        print(f"[{name}] Success ({dur:.2f}s) | TPS: {tps:.2f} | Priority: {payload.get('priority', 'batch')}")
        return data
    except Exception as e:
        # Deliberately broad: this function also runs as a thread target, and
        # a single failed request should be reported, not abort the benchmark.
        print(f"[{name}] Error: {e}")
    return None

def _wait_for_server(attempts=30, interval=2, request_timeout=5):
    """Poll ``/health`` until it answers HTTP 200 or *attempts* polls have failed.

    Args:
        attempts: Maximum number of health polls.
        interval: Seconds to sleep after each unsuccessful poll.
        request_timeout: Per-request timeout in seconds, so an unresponsive
            server cannot block a poll forever.

    Returns:
        ``True`` as soon as a poll returns HTTP 200, ``False`` otherwise.
    """
    for _ in range(attempts):
        try:
            response = requests.get(f"{BASE_URL}/health", timeout=request_timeout)
            if response.status_code == 200:
                return True
        except requests.RequestException:
            # Server not reachable yet; keep polling. Only request-level
            # failures are swallowed so Ctrl-C still interrupts the wait.
            pass
        time.sleep(interval)
    return False


def _record_generation_time(results, key, response):
    """Store ``response['metrics']['generation_time']`` in *results* under *key*.

    Nothing is stored when *response* is falsy (the request failed) or when
    the response carries no ``generation_time`` metric.
    """
    if not response:
        return
    generation_time = response.get('metrics', {}).get('generation_time')
    if generation_time is None:
        print(f"[{key}] Response has no generation_time metric; skipping.")
        return
    results[key] = generation_time


def _run_concurrent_priority_check(interactive_delay=1):
    """Send a batch request and, *interactive_delay* seconds later, an interactive one.

    Both requests run in their own thread; the function returns once both
    threads have finished. Outcomes are printed by ``test_request``.
    """
    def delayed_interactive():
        # Start slightly after the batch job so it is already in flight.
        time.sleep(interactive_delay)
        test_request("Interactive Job", {
            "prompt": "What is 2+2?",
            "priority": "interactive",
            "max_tokens": 10
        })

    workers = [
        # One long batch job.
        threading.Thread(target=test_request, args=("Batch Job", {
            "prompt": "Write a 200 word essay on the history of AI.",
            "priority": "batch",
            "max_tokens": 256
        })),
        # One high-priority interactive job after a slight delay.
        threading.Thread(target=delayed_interactive),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


def run_benchmarks():
    """Run the full benchmark sequence against the server at ``BASE_URL``.

    Steps, in order:
      1. Wait for ``/health`` to return HTTP 200; give up (and return) if it
         never does.
      2. Raw-text generation request.
      3. Generic JSON generation request.
      4. Schema-constrained JSON generation request; prints the returned
         ``structured_data`` when present.
      5. Print the relative generation-time overhead of generic JSON vs raw
         output when both timings are available.
      6. Fire a batch and an interactive request concurrently.

    All results are printed; nothing is returned.
    """
    print("Waiting for server to be healthy...")
    if not _wait_for_server():
        print("Server failed to start.")
        return
    print("Server is up!")

    results = {}

    # 1. Raw Output Test
    print("\n--- Testing Raw Output ---")
    raw_res = test_request("Raw Text", {
        "prompt": "Tell me a short joke about computers.",
        "format": "raw",
        "priority": "interactive"
    })
    _record_generation_time(results, 'raw', raw_res)

    # 2. Structured JSON Test (Generic)
    print("\n--- Testing Structured JSON (Generic) ---")
    json_res = test_request("JSON Generic", {
        "prompt": "Create a JSON object representing a person with name and age.",
        "format": "json",
        "priority": "interactive"
    })
    _record_generation_time(results, 'json_generic', json_res)

    # 3. Structured JSON Test (Schema)
    print("\n--- Testing Structured JSON (Schema) ---")
    schema_res = test_request("JSON Schema", {
        "prompt": "List 3 common programming languages with their creators.",
        "format": "json",
        "json_schema": {
            "type": "object",
            "properties": {
                "languages": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "creator": {"type": "string"}
                        },
                        "required": ["name", "creator"]
                    }
                }
            },
            "required": ["languages"]
        },
        "priority": "batch"
    })
    if schema_res:
        _record_generation_time(results, 'json_schema', schema_res)
        if 'structured_data' in schema_res:
            print(f"Structured Data: {json.dumps(schema_res['structured_data'], indent=2)}")
        else:
            print("Structured Data: <missing from response>")

    # 4. Latency Overhead Analysis
    if 'raw' in results and 'json_generic' in results:
        raw_time = results['raw']
        if raw_time:
            overhead = (results['json_generic'] - raw_time) / raw_time * 100
            print(f"\nLatency Overhead (JSON vs Raw): {overhead:.2f}%")
        else:
            # A zero baseline would make the percentage undefined.
            print("\nLatency Overhead (JSON vs Raw): n/a (raw generation time is zero)")

    # 5. Concurrent Requests Verification
    print("\n--- Testing Concurrent Priority/Preemption ---")
    _run_concurrent_priority_check()

# Run the benchmark suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_benchmarks()