|
| 1 | +""" |
| 2 | +Gemma 4 31B-it: simple text Q&A on XPU (OpenAI /v1), same shape as |
| 3 | +``test_gemma_4_e2b.py``. |
| 4 | +
|
| 5 | +Model card: https://huggingface.co/google/gemma-4-31B-it |
| 6 | +
|
| 7 | + - XPU test runs when Intel XPU is available. |
| 8 | + - Requires tp=4 on 4x Arc Pro B60 (24 GB). Memory-tightening block applied |
| 9 | + per smoke-test-setup skill section 3. |
| 10 | +
|
| 11 | +Run from test/srt:: |
| 12 | +
|
| 13 | + python3 -m unittest xpu.test_gemma_4_31b.TestGemma431BitXPU.test_simple_code_qa |
| 14 | +
|
| 15 | +Writes ``gemma_4_31b_comparison.txt`` in this directory: the file is recreated with a run header at module setup, then each test appends its result. |
| 16 | +
|
| 17 | +Server is started with ``sglang serve`` (``--model-impl sglang``). |
| 18 | +""" |
| 19 | + |
| 20 | +from __future__ import annotations |
| 21 | + |
| 22 | +import os |
| 23 | +import re |
| 24 | +import unittest |
| 25 | +from datetime import datetime, timezone |
| 26 | +from pathlib import Path |
| 27 | + |
| 28 | +import openai |
| 29 | + |
| 30 | +from sglang.srt.utils.common import is_xpu |
| 31 | +from sglang.test.test_utils import CustomTestCase |
| 32 | +from sglang.test.vlm_utils import ( |
| 33 | + DEFAULT_URL_FOR_TEST, |
| 34 | + kill_process_tree, |
| 35 | + popen_launch_server, |
| 36 | +) |
| 37 | + |
# Hugging Face model id that the server is launched with.
MODEL = "google/gemma-4-31B-it"

# Comparison log, written next to this test module.
COMPARISON_LOG_PATH = Path(__file__).resolve().with_name(
    "gemma_4_31b_comparison.txt"
)

# Server launch timeout in seconds: tp=4 weight-load + first-forward compile.
LAUNCH_TIMEOUT = 20 * 60
| 42 | + |
| 43 | + |
def _server_subprocess_env() -> dict:
    """Return the environment mapping for the launched server subprocess.

    The result is a copy of the current ``os.environ`` with the torch
    verbosity/debug switches below forced to ``"0"``.

    Starting from the parent environment matters: a mapping passed as
    ``env=`` to ``subprocess.Popen`` replaces the child's environment instead
    of extending it, so returning only the overrides would drop ``PATH`` and
    any variable exported before launch (such as ``SGLANG_USE_SGL_XPU``).
    NOTE(review): assumes ``popen_launch_server`` forwards ``env`` to
    ``subprocess.Popen`` unchanged -- confirm; if it already merges with
    ``os.environ`` this is merely redundant.

    Returns:
        A new ``dict``; mutating it does not touch ``os.environ``.
    """
    quiet_overrides = {
        "TORCHDYNAMO_VERBOSE": "0",
        "TORCHINDUCTOR_VERBOSE": "0",
        "TORCH_COMPILE_DEBUG": "0",
        "TORCH_SHOW_CPP_STACKTRACES": "0",
    }
    # Overrides come last so they win over inherited values.
    return {**os.environ, **quiet_overrides}
| 51 | + |
| 52 | + |
def _prettify_spm_style_text(s: str) -> str:
    """Replace tokenizer markers in *s*: ``Ċ`` becomes a newline, ``Ġ`` a space.

    Falsy input (``""`` or ``None``) is handed back unchanged.
    """
    if s:
        marker_table = str.maketrans({"Ċ": "\n", "Ġ": " "})
        return s.translate(marker_table)
    return s
| 58 | + |
| 59 | + |
def setUpModule():
    """Start a fresh comparison log by overwriting the file with a run header.

    The header names the model and records the UTC start time, followed by a
    ruler of 80 ``=`` characters and a blank line.
    """
    started_utc = datetime.now(timezone.utc).isoformat()
    header_lines = [
        "Gemma-4-31B-it — device comparison log",
        f"Model: {MODEL}",
        f"Run started (UTC): {started_utc}",
        "=" * 80,
        "",
        "",
    ]
    COMPARISON_LOG_PATH.write_text("\n".join(header_lines), encoding="utf-8")
| 68 | + |
| 69 | + |
def _append_comparison_log(
    *,
    title: str,
    device_cli: str,
    extra_server_notes: str,
    user_prompt: str,
    response,
) -> None:
    """Append one formatted result section to the comparison log file.

    Args:
        title: Heading line of the section.
        device_cli: Text printed after ``Server device flag:``.
        extra_server_notes: Free-form line printed under the device flag.
        user_prompt: The prompt that was sent to the server.
        response: Chat-completion response; ``choices[0].message`` and
            ``usage`` are read from it. A missing ``reasoning_content`` or
            missing usage fields are logged as empty / ``None``.
    """
    message = response.choices[0].message
    answer = _prettify_spm_style_text(message.content or "")
    thoughts = _prettify_spm_style_text(
        getattr(message, "reasoning_content", None) or ""
    )
    usage = response.usage

    hash_rule = "#" * 80
    usage_lines = [
        f" {field}: {getattr(usage, field, None)}"
        for field in ("prompt_tokens", "completion_tokens", "total_tokens")
    ]
    section_lines = [
        "",  # the section opens with a blank line
        hash_rule,
        f"{title}",
        f"Server device flag: {device_cli}",
        f"{extra_server_notes}",
        hash_rule,
        "--- user prompt ---",
        f"{user_prompt}",
        "--- assistant message.content ---",
        f"{answer}",
        "--- assistant message.reasoning_content (if any) ---",
        f"{thoughts}",
        "--- usage ---",
        *usage_lines,
        "=" * 80,
        "",  # yields the trailing newline after the closing ruler
    ]
    with COMPARISON_LOG_PATH.open("a", encoding="utf-8") as log_file:
        log_file.write("\n".join(section_lines))
| 99 | + |
| 100 | + |
# Chat template handed to the server through --chat-template. Per the note
# kept from the original author: the Gemma 4 tokenizer does not ship a
# chat_template, so the Gemma-family Jinja file next to this module is reused.
_CHAT_TEMPLATE_PATH = str(
    Path(__file__).resolve().with_name("gemma4_chat_template.jinja")
)

# 31B-it: tp=4 on Arc Pro B60 (24 GB each) with memory-tightening block.
# Each tuple is one CLI flag together with its value (if it takes one).
_XPU_SERVER_FLAG_GROUPS = (
    ("--device", "xpu"),
    ("--tp=4",),
    ("--trust-remote-code",),
    ("--disable-overlap-schedule",),
    ("--page-size", "64"),
    ("--attention-backend", "intel_xpu"),
    ("--model-impl", "sglang"),
    ("--chat-template", _CHAT_TEMPLATE_PATH),
    ("--mem-fraction-static", "0.92"),
    ("--context-length", "8192"),
    ("--chunked-prefill-size", "1024"),
    ("--max-running-requests", "8"),
    ("--cuda-graph-max-bs", "8"),
)

# Flat argv-style list, in the same order as the groups above.
XPU_SERVER_ARGS = [
    token for group in _XPU_SERVER_FLAG_GROUPS for token in group
]
| 132 | + |
# User prompt sent by the smoke test: three sentences joined by single spaces.
_SIMPLE_CODE_PROMPT = " ".join(
    (
        "Write a minimal Python function `def add(a, b):` that returns a+b.",
        "Reply with only the function, give a brief explanation.",
        "Finish with asking me How can I help you today?",
    )
)
| 138 | + |
| 139 | + |
def _simple_text_messages():
    """Return a new single-turn ``messages`` list holding the code prompt.

    The one user message carries its content as a list with a single
    ``{"type": "text", ...}`` part.
    """
    text_part = {"type": "text", "text": _SIMPLE_CODE_PROMPT}
    user_turn = {"role": "user", "content": [text_part]}
    return [user_turn]
| 149 | + |
| 150 | + |
def _compact_code_text(s: str) -> str:
    """Return *s* lower-cased with all whitespace removed.

    The ``Ġ`` / ``Ċ`` token markers are first turned into a space / newline,
    so they are removed along with real whitespace.
    """
    marker_table = str.maketrans({"Ġ": " ", "Ċ": "\n"})
    lowered = s.translate(marker_table).lower()
    return re.sub(r"\s+", "", lowered)
| 154 | + |
| 155 | + |
def _assert_code_reply(response):
    """Assert that *response* looks like an answer to the ``add`` prompt.

    Checks, in order: the first choice has role ``assistant``; content plus
    optional ``reasoning_content`` is non-empty; the text mentions ``def``,
    ``add`` and ``return`` (case-insensitive); ``a+b`` appears once
    whitespace is ignored; usage is present with a positive
    ``completion_tokens``.

    Raises:
        AssertionError: if any of the checks fails.
    """
    message = response.choices[0].message
    assert message.role == "assistant"

    body = message.content or ""
    thoughts = getattr(message, "reasoning_content", None) or ""
    combined = f"{body} {thoughts}".strip()
    assert len(combined) > 0

    lowered = combined.lower()
    mentions_def_add = all(word in lowered for word in ("def", "add"))
    assert (
        mentions_def_add
    ), f"expected a Python `def add` in reply, got: {combined!r}"
    assert "return" in lowered, f"expected `return` in reply, got: {combined!r}"

    squeezed = _compact_code_text(combined)
    assert (
        "a+b" in squeezed
    ), f"expected `a+b` (allowing spaces) in reply, got: {combined!r}"

    usage = response.usage
    assert usage is not None
    assert usage.completion_tokens > 0
| 174 | + |
| 175 | + |
@unittest.skipUnless(is_xpu(), "Intel XPU not available")
class TestGemma431BitXPU(CustomTestCase):
    """Smoke test: launch the model server on XPU and ask one code question.

    The server is started once per class with ``XPU_SERVER_ARGS`` and killed
    in ``tearDownClass``. ``SGLANG_USE_SGL_XPU`` is set in this process for
    the lifetime of the class and restored to its previous state afterwards.
    """

    _XPU_ENV_FLAG = "SGLANG_USE_SGL_XPU"
    # True only while this class has overridden the flag in os.environ.
    _env_patched = False
    _saved_xpu_flag = None
    process = None

    @classmethod
    def setUpClass(cls):
        """Export the XPU flag and launch the server; undo the flag on failure."""
        cls.model = MODEL
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        # Remember the caller's value so it can be put back later.
        cls._saved_xpu_flag = os.environ.get(cls._XPU_ENV_FLAG)
        os.environ[cls._XPU_ENV_FLAG] = "1"
        cls._env_patched = True

        try:
            cls.process = popen_launch_server(
                cls.model,
                cls.base_url,
                timeout=LAUNCH_TIMEOUT,
                api_key=cls.api_key,
                other_args=list(XPU_SERVER_ARGS),
                # NOTE(review): "cuda" here although the args select xpu;
                # presumably this only keeps the launcher from adding its own
                # auto-detected --device -- confirm in popen_launch_server.
                device="cuda",
                env=_server_subprocess_env(),
            )
        except BaseException:
            # unittest skips tearDownClass when setUpClass raises, so the
            # environment has to be cleaned up here.
            cls._restore_xpu_flag()
            raise
        # The OpenAI client talks to the /v1 prefix of the server.
        cls.base_url += "/v1"

    @classmethod
    def tearDownClass(cls):
        """Kill the server process tree (if one was started) and restore the env."""
        process = getattr(cls, "process", None)
        try:
            if process is not None:
                kill_process_tree(process.pid)
        finally:
            cls._restore_xpu_flag()

    @classmethod
    def _restore_xpu_flag(cls):
        """Put ``SGLANG_USE_SGL_XPU`` back to its pre-``setUpClass`` state."""
        if not cls._env_patched:
            return
        if cls._saved_xpu_flag is None:
            os.environ.pop(cls._XPU_ENV_FLAG, None)
        else:
            os.environ[cls._XPU_ENV_FLAG] = cls._saved_xpu_flag
        cls._env_patched = False

    def test_simple_code_qa(self):
        """Send the code prompt, record the reply in the log, then validate it."""
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        response = client.chat.completions.create(
            model="default",
            messages=_simple_text_messages(),
            temperature=0,
            max_tokens=96,
        )
        # Log before asserting so that a failing reply is still captured in
        # the comparison file.
        _append_comparison_log(
            title="OUTPUT FROM --device XPU (Gemma-4-31B-it)",
            device_cli="--device xpu",
            extra_server_notes="SGLANG_USE_SGL_XPU=1; tp=4; intel_xpu attention backend; mem-fraction-static=0.92; context-length=8192.",
            user_prompt=_SIMPLE_CODE_PROMPT,
            response=response,
        )
        _assert_code_reply(response)
| 216 | + |
| 217 | + |
# Allow running this file directly as a script; unittest discovers the tests.
if __name__ == "__main__":
    unittest.main()
0 commit comments