diff --git a/.claude/skills/msgspec-patterns/SKILL.md b/.claude/skills/msgspec-patterns/SKILL.md new file mode 100644 index 00000000..3ded569a --- /dev/null +++ b/.claude/skills/msgspec-patterns/SKILL.md @@ -0,0 +1,412 @@ +--- +name: msgspec-patterns +description: Reference guide for msgspec.Struct usage patterns, performance tips, and gc=False safety analysis. Use when writing or reviewing msgspec Struct definitions, encoding/decoding code, or deciding whether gc=False is safe. +allowed-tools: Read, Grep, Glob +--- + +## Use Structs for Structured Data + +Always prefer `msgspec.Struct` over `dict`, `dataclasses`, or `attrs` for structured data with a known schema. Structs are 5-60x faster for common operations and are optimized for encoding/decoding. + +```python +# BAD +from dataclasses import dataclass + +@dataclass +class User: + name: str + email: str + age: int + +# GOOD +import msgspec + +class User(msgspec.Struct): + name: str + email: str + age: int + +user = User(name="alice", email="alice@example.com", age=30) +data = msgspec.json.encode(user) +decoded = msgspec.json.decode(data, type=User) +``` + +## Struct Configuration Options + +| Option | Description | Default | +| ----------------------- | --------------------------------------------- | -------- | +| `omit_defaults` | Omit fields with default values when encoding | `False` | +| `forbid_unknown_fields` | Error on unknown fields when decoding | `False` | +| `frozen` | Make instances immutable and hashable | `False` | +| `order` | Generate ordering methods (`__lt__`, etc.) | `False` | +| `eq` | Generate equality methods | `True` | +| `kw_only` | Make all fields keyword-only | `False` | +| `tag` | Enable tagged union support | `None` | +| `tag_field` | Field name for the tag | `"type"` | +| `rename` | Rename fields for encoding/decoding | `None` | +| `array_like` | Encode/decode as arrays instead of objects | `False` | +| `gc` | Enable garbage collector tracking | `True` | +| `weakref` | Enable weak reference support | `False` | +| `dict` | Add `__dict__` attribute | `False` | +| `cache_hash` | Cache the hash value | `False` | + +## Omit Default Values + +Set `omit_defaults=True` when default values are known on both encoding and decoding ends. Reduces encoded message size and improves performance. + +```python +class Config(msgspec.Struct, omit_defaults=True): + host: str = "localhost" + port: int = 8080 + debug: bool = False + +config = Config(host="production.example.com") +msgspec.json.encode(config) +# b'{"host":"production.example.com"}' — port and debug omitted +``` + +## Avoid Decoding Unused Fields + +Define smaller "view" Struct types that only contain the fields you actually need. msgspec skips decoding fields not defined in your Struct, reducing allocations and CPU time. + +```python +# BAD: decodes entire object +class FullTweet(msgspec.Struct): + id: int + full_text: str + user: dict + entities: dict + retweet_count: int + favorite_count: int + # ... many more fields + +# GOOD: only these fields are decoded, the rest is skipped +class User(msgspec.Struct): + name: str + +class TweetView(msgspec.Struct): + user: User + full_text: str + favorite_count: int +``` + +## array_like=True + +Set `array_like=True` when both ends know the field schema. Encodes structs as arrays instead of objects, removing field names from the message — smaller and faster. 
+ +```python +class Point(msgspec.Struct, array_like=True): + x: float + y: float + z: float + +point = Point(1.0, 2.0, 3.0) +msgspec.json.encode(point) +# b'[1.0,2.0,3.0]' instead of b'{"x":1.0,"y":2.0,"z":3.0}' +``` + +## Tagged Unions + +Use `tag=True` on Struct types when handling multiple message types in a single union for efficient type discrimination during decoding. + +```python +class GetRequest(msgspec.Struct, tag=True): + key: str + +class PutRequest(msgspec.Struct, tag=True): + key: str + value: str + +class DeleteRequest(msgspec.Struct, tag=True): + key: str + +Request = GetRequest | PutRequest | DeleteRequest +decoder = msgspec.msgpack.Decoder(Request) + +data = msgspec.msgpack.encode(PutRequest(key="foo", value="bar")) +request = decoder.decode(data) + +match request: + case GetRequest(key): print(f"Get: {key}") + case PutRequest(key, value): print(f"Put: {key}={value}") + case DeleteRequest(key): print(f"Delete: {key}") +``` + +## Use encode_into for Buffer Reuse + +In hot loops, use `Encoder.encode_into()` with a pre-allocated `bytearray` instead of `encode()` to avoid allocating a new `bytes` object per call. Always measure before adopting. + +```python +# BAD: new bytes object allocated each iteration +encoder = msgspec.msgpack.Encoder() +for msg in msgs: + data = encoder.encode(msg) + socket.sendall(data) + +# GOOD: reuse a buffer +encoder = msgspec.msgpack.Encoder() +buffer = bytearray(1024) +for msg in msgs: + n = encoder.encode_into(msg, buffer) + socket.sendall(memoryview(buffer)[:n]) +``` + +## NDJSON with encode_into + +For line-delimited JSON, use `encode_into()` to avoid the copy from string concatenation: + +```python +encoder = msgspec.json.Encoder() +buffer = bytearray(64) +for msg in messages: + n = encoder.encode_into(msg, buffer) + file.write(memoryview(buffer)[:n]) + file.write(b"\n") +``` + +## Length-Prefix Framing + +Use `encode_into()` with an offset to efficiently prepend a message length without extra copies: + +```python +def send_length_prefixed(socket, msg): + encoder = msgspec.msgpack.Encoder() + buffer = bytearray(64) + n = encoder.encode_into(msg, buffer, 4) # leave 4 bytes at front + buffer[:4] = n.to_bytes(4, "big") + socket.sendall(memoryview(buffer)[:4 + n]) + +async def prefixed_send(stream, buffer: bytes) -> None: + stream.write(len(buffer).to_bytes(4, "big")) + stream.write(buffer) + await stream.drain() + +async def prefixed_recv(stream) -> bytes: + prefix = await stream.readexactly(4) + n = int.from_bytes(prefix, "big") + return await stream.readexactly(n) +``` + +## Use MessagePack for Internal APIs + +`msgspec.msgpack` is more compact and can be more performant than `msgspec.json` for internal service communication. 
+ +```python +class Event(msgspec.Struct): + type: str + data: dict + timestamp: float + +encoder = msgspec.msgpack.Encoder() +decoder = msgspec.msgpack.Decoder(Event) +packed = encoder.encode(Event(type="login", data={"user_id": 123}, timestamp=1703424000.0)) +``` + +## TOML Configuration Files + +Use msgspec for parsing pyproject.toml and other TOML config files with validation: + +```python +class BuildSystem(msgspec.Struct, omit_defaults=True, rename="kebab"): + requires: list[str] = [] + build_backend: str | None = None + +class Project(msgspec.Struct, omit_defaults=True, rename="kebab"): + name: str | None = None + version: str | None = None + dependencies: list[str] = [] + +class PyProject(msgspec.Struct, omit_defaults=True, rename="kebab"): + build_system: BuildSystem | None = None + project: Project | None = None + tool: dict[str, dict[str, Any]] = {} + +def load_pyproject(path: str) -> PyProject: + with open(path, "rb") as f: + return msgspec.toml.decode(f.read(), type=PyProject) +``` + +--- + +## gc=False — Safety Analysis + +Setting `gc=False` on a Struct means instances are **never tracked** by Python's garbage collector. This reduces GC pressure (up to 75x less GC pause time, 16 bytes saved per instance). The **only** risk: if a **reference cycle** involves only `gc=False` structs, that cycle will **never be collected** — memory leak. + +Reference: [msgspec Structs – Disabling Garbage Collection](https://jcristharif.com/msgspec/structs.html#struct-gc) + +### When to use this analysis + +- Adding or modifying a class that inherits from `msgspec.Struct` +- Reviewing or refactoring code that defines or uses msgspec structs +- Deciding whether to add or remove `gc=False` on a Struct + +### Verified safety constraints + +All of the following must hold to use `gc=False` safely. + +**1. No reference cycles** + +- The struct (and any container it references) must never be part of a reference cycle. +- Multiple variables pointing to the same struct (`x = s; y = x`) are safe — that is not a cycle. A cycle is A → B → … → A. +- Returning a struct from a function is safe. What matters is whether any reference path leads back to the struct. + +**2. No mutation that could create cycles** + +- Do not mutate struct fields after construction in a way that could introduce a cycle (e.g. set a field to an object that references the struct, or append the struct to its own list/dict). +- Frozen structs (`frozen=True`) prevent field reassignment; `force_setattr` in `__post_init__` is one-time init only — acceptable. +- Assigning scalars (int, str, bool, float, None) to fields is always safe. + +**3. Mutable containers (list, dict, set) on the struct** + +- If the struct has list/dict/set fields, either: + - Never mutate those containers after creation and never store in them any object that references the struct, or + - Do not use `gc=False` (conservative). +- Reading from containers does not create cycles and is always allowed. + +**4. Nested structs** + +- If a struct holds another Struct (or containers that hold Structs), the same rules apply to the whole reference graph. No cycles, no mutation that could create cycles. + +**5. Generic / mixins** + +- With `gc=False`, the type must be compatible with `__slots__` (e.g. if using `Generic`, the mixin must define `__slots__ = ()`). See msgspec issue #631 / PR #635. + +### Decision tree + +``` +Should I use gc=False? +│ +├── Does your Struct only contain scalar types (int, float, str, bool, bytes)? 
+│ └── YES → SAFE +│ +├── Does your Struct contain lists/dicts and you control what goes in them? +│ └── Will you EVER put the struct itself (or a parent) into those containers? +│ ├── NO → Probably safe, but audit carefully +│ └── YES/MAYBE → Do NOT use gc=False +│ +├── Does your Struct reference another Struct of the same type (tree, linked list)? +│ └── YES → Do NOT use gc=False +│ +├── Is your Struct part of a bidirectional parent-child relationship? +│ └── YES → Do NOT use gc=False +│ +└── When in doubt → Do NOT use gc=False +``` + +### Examples + +```python +# SAFE: only scalar values +class Point(msgspec.Struct, gc=False): + x: float + y: float + z: float + +# SAFE: immutable tuple of scalars +class Package(msgspec.Struct, gc=False): + name: str + version: str + depends: tuple[str, ...] + size: int + +# UNSAFE: self-referential — do NOT use gc=False +class TreeNode(msgspec.Struct): # no gc=False + value: int + children: list["TreeNode"] + parent: "TreeNode | None" = None +``` + +### Real-world example: decoding large JSON + +```python +class Package(msgspec.Struct, gc=False): + build: str + build_number: int + depends: tuple[str, ...] # tuple, not list — immutable + md5: str + name: str + sha256: str + version: str + license: str = "" + size: int = 0 + timestamp: int = 0 + +class RepoData(msgspec.Struct, gc=False): + repodata_version: int + info: dict + packages: dict[str, Package] + removed: tuple[str, ...] + +decoder = msgspec.json.Decoder(RepoData) + +def load_repo_data(path: str) -> RepoData: + with open(path, "rb") as f: + return decoder.decode(f.read()) +``` + +### Checklist: can use gc=False + +- [ ] Struct and everything it references can never participate in a reference cycle. +- [ ] No mutation of struct fields after construction that could introduce a cycle (frozen or init-only mutation is ok; scalar assignment is ok). +- [ ] Any list/dict/set fields are never mutated after creation. +- [ ] No storing the struct (or anything that references it) inside its own container fields. +- [ ] If Generic/mixins are used, `__slots__` compatibility is satisfied. + +### Checklist: must NOT use gc=False + +- [ ] Struct is mutated after creation in a way that could create a cycle. +- [ ] Container fields are mutated after creation and could hold the struct or back-references. +- [ ] Struct is used in a pattern where it's stored in a container that the struct also references. + +### Per-struct analysis steps + +1. List all fields and their types (scalars vs containers vs nested Structs). +2. Search the codebase for: assignments to this struct's fields, mutations of its container fields (`.append`, `.update`, etc.), and any place the struct instance is stored in a list/dict that might be referenced by the struct. +3. If only scalars or immutable types, or frozen with no container mutation → likely safe. +4. If mutable containers and they're never mutated → likely safe; otherwise → do not use `gc=False`. + +### Risky structs: AT-RISK audit pattern + +A struct is **risky** for `gc=False` if it has a condition that would normally disallow it (e.g. a mutable dict field) but that condition never arises in practice (e.g. the field is only ever read). + +**Auditing a risky struct:** + +1. Identify the at-risk condition (e.g. "has `metadata: dict` that could be mutated"). +2. 
Search the codebase for all uses of that struct and of the at-risk field: + - Field assignment: `obj.field = ...`, `obj.field[key] = ...`, `obj.field.append(...)`, `obj.field.update(...)` + - Any code path that stores the struct (or something holding it) inside that container. +3. If the audit finds no such mutation or cycle-creating storage, `gc=False` is acceptable — **but add the AT-RISK marker** so future changes are re-audited. + +**When audit passes** — set `gc=False` and add: + +- A comment above the class stating why gc=False is used and when the audit was done: + `# gc=False: audit YYYY-MM: is only read, never mutated.` +- A docstring line signalling that changes must trigger re-audit: + `AT-RISK (gc=False): Has . Any change that must be audited; if so, remove gc=False.` + +```python +# gc=False: audit 2026-03: metadata dict is only ever read, never mutated after construction. +class QueryResult(msgspec.Struct, frozen=True, array_like=True, gc=False): + """Result of a completed inference query. + + AT-RISK (gc=False): Has mutable container field `metadata`. Any change that + mutates `metadata` after construction or stores this struct in a container + referenced by this struct must be audited; if so, remove gc=False. + """ + ... +``` + +**When touching an AT-RISK struct:** + +1. Re-run the audit searches above. +2. If your change mutates the at-risk field(s) or creates a cycle → remove `gc=False` and the AT-RISK comment. +3. If your change does not touch the at-risk field → existing `gc=False` and AT-RISK comment remain; optionally update the audit date. + +--- + +## References + +- [msgspec Structs](https://jcristharif.com/msgspec/structs.html) +- [msgspec Performance Tips](https://jcristharif.com/msgspec/perf-tips.html) +- [msgspec Structs – Disabling Garbage Collection](https://jcristharif.com/msgspec/structs.html#struct-gc) +- [msgspec #631 – Generic structs and gc=False](https://github.com/jcrist/msgspec/issues/631) diff --git a/.claude/skills/msgspec-struct-gc-check/SKILL.md b/.claude/skills/msgspec-struct-gc-check/SKILL.md new file mode 100644 index 00000000..1361e5ce --- /dev/null +++ b/.claude/skills/msgspec-struct-gc-check/SKILL.md @@ -0,0 +1,120 @@ +--- +name: msgspec-struct-gc-check +description: Check whether msgspec.Struct types can safely use gc=False. Use when adding or changing msgspec.Struct definitions, or when reviewing code that uses msgspec structs. +allowed-tools: Read, Grep, Glob +--- + +# msgspec.Struct gc=False Safety Check + +## When to use this skill + +- Adding or modifying a class that inherits from `msgspec.Struct` +- Reviewing or refactoring code that defines or uses msgspec structs +- Deciding whether to add or remove `gc=False` on a Struct + +## Why gc=False matters + +Setting `gc=False` on a Struct means instances are **never tracked** by Python's garbage collector. This reduces GC pressure and can improve performance when many structs are allocated. The **only** risk: if a **reference cycle** involves only gc=False structs (or objects not tracked by GC), that cycle will **never be collected** (memory leak). + +Reference: [msgspec Structs – Disabling Garbage Collection](https://jcristharif.com/msgspec/structs.html#struct-gc). + +## Verified safety constraints + +Use these constraints to decide if a Struct can use `gc=False`. All must hold. + +### 1. No reference cycles + +- The struct (and any container it references) must never be part of a reference cycle. 
+- **Multiple variables** pointing to the same struct (`x = s; y = x`) are **safe** — that is not a cycle. A cycle is A → B → … → A. +- **Returning** a struct from a function is **safe**. What matters is whether any reference path leads back to the struct (e.g. struct's list contains the struct or something that holds the struct). + +### 2. No mutation that could create cycles + +- **Do not mutate** struct fields after construction in a way that could introduce a cycle (e.g. set a field to an object that references the struct, or append the struct to its own list/dict). +- **Frozen structs** (`frozen=True`) prevent field reassignment; `force_setattr` in `__post_init__` is one-time init only, so that's acceptable. +- Assigning **scalars** (int, str, bool, float, None) to fields is safe — they cannot form cycles. + +### 3. Mutable containers (list, dict, set) on the struct + +- If the struct has list/dict/set fields, either: + - **Never mutate** those containers after creation (no `.append`, `.update`, `[...] = ...`, etc.), and never store in them any object that references the struct, or + - Do not use `gc=False` (conservative). +- **Reading** from containers (e.g. `x = struct.foobars[i]`) does not create cycles and is allowed. + +### 4. Nested structs + +- If a struct holds another Struct (or holds containers that hold Structs), the same rules apply to the whole reference graph: no cycles, no mutation that could create cycles. If any nested Struct uses `gc=False`, the whole graph must still be cycle-free. + +### 5. Generic / mixins + +- With `gc=False`, the type must be compatible with `__slots__` (e.g. if using `Generic`, the mixin must define `__slots__ = ()`). See msgspec issue #631 / PR #635. + +## Checklist for "can use gc=False" + +- [ ] Struct and everything it references can never participate in a reference cycle. +- [ ] No mutation of struct fields after construction that could introduce a cycle (frozen or init-only mutation is ok; scalar assignment is ok). +- [ ] Any list/dict/set fields are never mutated after creation, or we do not use gc=False. +- [ ] No storing the struct (or anything that references it) inside its own container fields. +- [ ] If Generic/mixins are used, `__slots__` compatibility is satisfied. + +## Checklist for "must NOT use gc=False" + +- [ ] Struct is mutated after creation in a way that could create a cycle (e.g. appending self to a list field). +- [ ] Container fields are mutated after creation and could hold the struct or back-references. +- [ ] Struct is used in a pattern where it's stored in a container that the struct (or its fields) also references. + +## Quick per-struct analysis steps + +1. List all fields and their types (scalars vs containers vs nested Structs). +2. Search the codebase for: assignments to this struct's fields, mutations of its container fields (`.append`, `.update`, etc.), and any place the struct instance is stored (e.g. in a list/dict that might be referenced by the struct). +3. If only scalars or immutable types, or frozen with no container mutation → likely safe for gc=False. +4. If mutable containers and they're never mutated (and never made to reference the struct) → likely safe; otherwise → do not use gc=False. + +## Risky structs: audit and at-risk comment + +A struct is **risky** for gc=False if it has a condition that would normally disallow gc=False (e.g. mutable list/dict/set fields), but that condition might never arise in practice (e.g. the field is only ever read, never mutated after construction). 
+ +### Auditing a risky struct + +1. Identify the at-risk condition (e.g. "has `metadata: dict` that could be mutated"). +2. Search the codebase for all uses of that struct and of the at-risk field: + - Any assignment to the field: `obj.field = ...`, `obj.field[key] = ...`, `obj.field.append(...)`, `obj.field.update(...)`, etc. + - Any code path that could store the struct (or something holding it) inside that container. +3. If the audit finds **no** such mutation or cycle-creating storage, the condition never arises and gc=False is acceptable **provided** you add the at-risk marker so future changes are re-audited. + +### When audit passes + +- Set `gc=False` on the struct. +- Add an **at-risk comment** and docstring note: + + - **Above the class**: a short comment stating why gc=False is used despite the at-risk condition, and when the audit was done (e.g. `# gc=False: audit YYYY-MM: is only read, never mutated.`). + - **In the docstring**: a line that signals to future readers and to this skill that changes touching this struct must be re-audited. Use this format: + + `AT-RISK (gc=False): Has . Any change that must be audited; if so, remove gc=False.` + +- Example (for a struct with a `metadata` dict that is only ever read): + + ```python + # gc=False: audit 2026-03: metadata dict is only ever read, never mutated after construction. + class QueryResult(msgspec.Struct, ..., gc=False): + """Result of a completed inference query. + + AT-RISK (gc=False): Has mutable container field `metadata`. Any change that + mutates `metadata` after construction or stores this struct in a container + referenced by this struct must be audited; if so, remove gc=False. + ... + ``` + +### When touching an at-risk struct + +If you are adding or changing code that uses a struct marked AT-RISK (gc=False): + +1. Re-run the audit for that struct (searches above). +2. If your change mutates the at-risk field(s) or creates a cycle (e.g. stores the struct in its own container), **remove** `gc=False` from the struct and remove the at-risk comment/docstring line. +3. If your change does not touch the at-risk field or create cycles, the existing gc=False and at-risk comment remain; you may add a short note in the at-risk comment if the audit was re-checked (e.g. update the audit date). 
+ +## References + +- [msgspec Structs – Disabling Garbage Collection](https://jcristharif.com/msgspec/structs.html#struct-gc) +- [msgspec Performance Tips – Use gc=False](https://jcristharif.com/msgspec/perf-tips.html#use-gc-false) +- [msgspec #631 – Generic structs and gc=False](https://github.com/jcrist/msgspec/issues/631) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 974d34fe..1ec8b590 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,13 +30,13 @@ jobs: run: | pytest -xv -m "not slow and not performance" --cov=src --cov-report=xml --cov-report=html - - name: Upload coverage to Codecov - uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v6.0.0 + - name: Upload coverage report + uses: actions/upload-artifact@v4 with: - file: ./coverage.xml - flags: unittests - name: codecov-umbrella - fail_ci_if_error: false + name: coverage-report + path: | + coverage.xml + htmlcov/ audit: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index a7dce714..8dc22a68 100644 --- a/.gitignore +++ b/.gitignore @@ -189,5 +189,10 @@ outputs/ # Example vLLM virtualenv examples/03_BenchmarkComparison/vllm_venv/ -# Cursor artifacts (local development only) +# Agent artifacts (local development only) .cursor_artifacts/ +.claude/agent-memory/ + +# User-specific local rules (local Docker dev); do not commit +.cursor/rules/local-docker-dev.mdc +CLAUDE.local.md diff --git a/AGENTS.md b/AGENTS.md index d3683465..8bb4a38d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -73,7 +73,7 @@ CLI is auto-generated from `config/schema.py` Pydantic models via cyclopts. Fiel - **CLI mode** (`offline`/`online`): cyclopts constructs `OfflineBenchmarkConfig`/`OnlineBenchmarkConfig` (subclasses in `config/schema.py`) directly from CLI args. Type locked via `Literal`. `--dataset` is repeatable with TOML-style format `[perf|acc:][,key=value...]` (e.g. `--dataset data.csv,samples=500,parser.prompt=article`). Full accuracy support via `accuracy_config.eval_method=pass_at_1` etc. - **YAML mode** (`from-config`): `BenchmarkConfig.from_yaml_file()` loads YAML, resolves env vars, and auto-selects the right subclass via Pydantic discriminated union. Optional `--timeout`/`--mode` overrides via `config.with_updates()`. 
-- **eval**: Not yet implemented (raises `NotImplementedError`) +- **eval**: Not yet implemented (raises `CLIError` with a tracking issue link) ### Config Construction & Validation @@ -137,7 +137,11 @@ src/inference_endpoint/ │ └── utils.py # Port range helpers ├── async_utils/ │ ├── loop_manager.py # LoopManager (uvloop + eager_task_factory) +│ ├── runner.py # run_async() — uvloop + eager_task_factory entry point for CLI commands │ ├── event_publisher.py # Async event pub/sub +│ ├── services/ +│ │ ├── event_logger/ # EventLoggerService: writes EventRecords to JSONL/SQLite +│ │ └── metrics_aggregator/ # MetricsAggregatorService: real-time metrics (TTFT, TPOT, ISL, OSL) │ └── transport/ # ZMQ-based IPC transport layer │ ├── protocol.py # Transport protocols + TransportConfig base │ ├── record.py # Transport records @@ -192,7 +196,7 @@ tests/ ## Development Standards -### Code Style +### Code Style and Pre-commit Hooks - **Formatter/Linter**: `ruff` (line-length 88, target Python 3.12) - **Type checking**: `mypy` (via pre-commit) @@ -200,18 +204,12 @@ tests/ - **License headers**: Required on all Python files (enforced by pre-commit hook `scripts/add_license_header.py`) - **Conventional commits**: `feat:`, `fix:`, `docs:`, `test:`, `chore:` -### Pre-commit Hooks - -All of these run automatically on commit: - -- trailing-whitespace, end-of-file-fixer, check-yaml, check-merge-conflict, debug-statements -- `ruff` (lint + autofix) and `ruff-format` -- `mypy` type checking -- `prettier` for YAML/JSON/Markdown -- License header enforcement +All of these hooks run automatically on commit: trailing-whitespace, end-of-file-fixer, check-yaml, check-merge-conflict, debug-statements, `ruff` (lint + autofix), `ruff-format`, `mypy`, `prettier` (YAML/JSON/Markdown), license header enforcement. **Always run `pre-commit run --all-files` before committing.** +See [Development Guide](docs/DEVELOPMENT.md) for full setup and workflow details. + ### Data Types & Serialization - **Core types** (`Query`, `QueryResult`, `StreamChunk`): `msgspec.Struct` with `frozen=True`, `array_like=True`, `gc=False`, `omit_defaults=True` @@ -291,7 +289,7 @@ Update AGENTS.md as part of any PR that includes a **significant refactor**, mea - **Added or removed CLI commands/subcommands** — update CLI Modes and Common Commands - **Changed test infrastructure** (new fixtures, changed markers, new test directories) — update Testing section - **Added or removed key dependencies** — update Key Dependencies table -- **Changed build/tooling** (new pre-commit hooks, changed ruff config, new CI steps) — update Code Style and Pre-commit Hooks +- **Changed build/tooling** (new pre-commit hooks, changed ruff config, new CI steps) — update [docs/DEVELOPMENT.md](docs/DEVELOPMENT.md) - **Changed hot-path patterns** (new transport, changed serialization, new performance constraints) — update Performance Guidelines ### How to Update diff --git a/CLAUDE.md b/CLAUDE.md index 078c29c4..a0810401 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,4 +2,6 @@ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
+Full guidance is maintained in AGENTS.md (shared with all AI coding agents) and is included below: + @AGENTS.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d90ed9c3..8de1bbe9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,3 +7,5 @@ Generally we encourage people to become MLCommons members if they wish to contri Regardless of whether you are a member, your organization (or you as an individual contributor) needs to sign the MLCommons Contributor License Agreement (CLA). Please submit your GitHub username to the [MLCommons Subscription form](https://mlcommons.org/community/subscribe/) to start that process. MLCommons project work is tracked with issue trackers and pull requests. Modify the project in your own fork and issue a pull request once you want other developers to take a look at what you have done and discuss the proposed changes. Ensure that cla-bot and other checks pass for your pull requests. + +For project-specific development standards (code style, test requirements, pre-commit hooks, commit format), see the [Development Guide](docs/DEVELOPMENT.md). diff --git a/README.md b/README.md index 276ab997..9af4eb85 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ inference-endpoint benchmark offline \ ```bash # Start local echo server -python -m inference_endpoint.testing.echo_server --port 8765 & +python3 -m inference_endpoint.testing.echo_server --port 8765 & # Test with dummy dataset (included in repo) inference-endpoint benchmark offline \ @@ -94,33 +94,51 @@ pytest -m "not performance and not run_explicitly" ## 📚 Documentation +- [AGENTS.md](AGENTS.md) - Architecture, conventions, and AI agent guidelines - [CLI Quick Reference](docs/CLI_QUICK_REFERENCE.md) - Command-line interface guide - [Local Testing Guide](docs/LOCAL_TESTING.md) - Test with echo server - [Development Guide](docs/DEVELOPMENT.md) - How to contribute and develop +- [Performance Architecture](docs/PERF_ARCHITECTURE.md) - Hot-path design and tuning +- [Performance Tuning](docs/CLIENT_PERFORMANCE_TUNING.md) - CPU affinity and client tuning - [GitHub Setup Guide](docs/GITHUB_SETUP.md) - GitHub authentication and setup +### Component Design Specs + +Each top-level component under `src/inference_endpoint/` has a corresponding spec: + +| Component | Spec | +| ----------------- | ---------------------------------------------------------------- | +| Core types | [docs/core/DESIGN.md](docs/core/DESIGN.md) | +| Load generator | [docs/load_generator/DESIGN.md](docs/load_generator/DESIGN.md) | +| Endpoint client | [docs/endpoint_client/DESIGN.md](docs/endpoint_client/DESIGN.md) | +| Metrics | [docs/metrics/DESIGN.md](docs/metrics/DESIGN.md) | +| Config | [docs/config/DESIGN.md](docs/config/DESIGN.md) | +| Async utils | [docs/async_utils/DESIGN.md](docs/async_utils/DESIGN.md) | +| Dataset manager | [docs/dataset_manager/DESIGN.md](docs/dataset_manager/DESIGN.md) | +| Commands (CLI) | [docs/commands/DESIGN.md](docs/commands/DESIGN.md) | +| OpenAI adapter | [docs/openai/DESIGN.md](docs/openai/DESIGN.md) | +| SGLang adapter | [docs/sglang/DESIGN.md](docs/sglang/DESIGN.md) | +| Evaluation | [docs/evaluation/DESIGN.md](docs/evaluation/DESIGN.md) | +| Testing utilities | [docs/testing/DESIGN.md](docs/testing/DESIGN.md) | +| Profiling | [docs/profiling/DESIGN.md](docs/profiling/DESIGN.md) | +| Plugins | [docs/plugins/DESIGN.md](docs/plugins/DESIGN.md) | +| Utils | [docs/utils/DESIGN.md](docs/utils/DESIGN.md) | + ## 🎯 Architecture The system follows a modular, event-driven architecture: ``` 
-┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ Dataset │ │ Load │ │ Endpoint │ -│ Manager │───▶│ Generator │───▶│ Client │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ - │ │ │ - ▼ ▼ ▼ -┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ Metrics │ │ Configuration │ │ Endpoint │ -│ Collector │◄───│ Manager │ │ (External) │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ +Dataset Manager ──► Load Generator ──► Endpoint Client ──► External Endpoint + │ + Metrics Collector + (event logging + reporting) ``` -- **Load Generator**: Central orchestrator managing query lifecycle -- **Dataset Manager**: Handles benchmark datasets and preprocessing -- **Endpoint Client**: Abstract interface for endpoint communication -- **Metrics Collector**: Performance measurement and analysis -- **Configuration Manager**: System configuration (TBD) +- **Dataset Manager**: Loads benchmark datasets and applies transform pipelines +- **Load Generator**: Central orchestrator — controls timing (scheduler), issues queries, and emits sample events +- **Endpoint Client**: Multi-process HTTP worker pool communicating over ZMQ IPC +- **Metrics Collector**: Receives sample events from Load Generator; writes to SQLite (EventRecorder), aggregates after the run (MetricsReporter) ## Accuracy Evaluation @@ -132,14 +150,13 @@ configuration. Currently, Inference Endpoints provides the following pre-defined - LiveCodeBench (default: lite, release_v6) However, LiveCodeBench will not work out-of-the-box and requires some additional setup. See the -[LiveCodeBench](src/inference_endpoint/dataset_manager/predefined/livecodebench/README.md) documentation -for details and explanations. +[LiveCodeBench](src/inference_endpoint/evaluation/livecodebench/README.md) documentation for +details and explanations. ## 🚧 Pending Features The following features are planned for future releases: -- [ ] **Performance Tuning** - Advanced performance optimization features - [ ] **Submission Ruleset Integration** - Full MLPerf submission workflow support - [ ] **Documentation Generation and Hosting** - Sphinx-based API documentation with GitHub Pages @@ -166,7 +183,8 @@ We are grateful to these communities for their contributions to LLM benchmarking ## 📄 License -This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE.md) file for +details. ## 🔗 Links diff --git a/docs/CLI_DESIGN.md b/docs/CLI_DESIGN.md index e7ba2b72..21d980a9 100644 --- a/docs/CLI_DESIGN.md +++ b/docs/CLI_DESIGN.md @@ -172,9 +172,11 @@ InputValidationError 2 Bad user input, invalid config SetupError 3 Dataset load failure, connection error ExecutionError 4 Benchmark failed after setup CLIError 1 Generic CLI error (base class) -NotImplementedError 1 Unimplemented command (eval) ``` +The reserved `eval` command currently raises `CLIError` with a tracking issue link rather than a +dedicated exception type. + ## Development Guide ### Adding a CLI flag diff --git a/docs/CLI_QUICK_REFERENCE.md b/docs/CLI_QUICK_REFERENCE.md index 8fefdbe8..a6767933 100644 --- a/docs/CLI_QUICK_REFERENCE.md +++ b/docs/CLI_QUICK_REFERENCE.md @@ -1,13 +1,6 @@ # CLI Quick Reference -## Architecture - -The CLI is auto-generated from Pydantic models in `config/schema.py` using -cyclopts. schema.py is the single source of truth for both YAML configs and CLI flags. 
- -- **All schema fields** available as CLI flags on each subcommand (dotted kebab-case) -- **Shorthand aliases** declared via `cyclopts.Parameter(alias="--flag")` on schema fields -- **`${VAR}` interpolation** in YAML files (with `${VAR:-default}` fallback) +Command-line reference for all `inference-endpoint` subcommands, flags, load patterns, and usage examples. ## Commands @@ -109,6 +102,9 @@ Flag names shown as `--full.dotted.path --alias`. Both forms work. - `--endpoint-config.api-key --api-key` - API authentication - `--endpoint-config.api-type --api-type` - API type: openai/sglang (default: openai) - `--report-dir` - Report output directory + Note: applies to CLI-driven `benchmark offline` / `benchmark online`; `benchmark from-config` + does not expose a CLI override for `report_dir`. Set it in the YAML only if you need to control + the output location; otherwise a default report directory is used. - `--timeout` - Global timeout in seconds - `--enable-cpu-affinity / --no-cpu-affinity` - NUMA-aware CPU pinning (default: true) @@ -169,7 +165,7 @@ Accuracy config is supported in both CLI and YAML: inference-endpoint benchmark offline \ --endpoints URL --model M \ --dataset perf:perf.jsonl \ - --dataset acc:eval.jsonl,accuracy_config.eval_method=pass_at_1,accuracy_config.ground_truth=answer \ + --dataset acc:eval.jsonl,accuracy_config.eval_method=pass_at_1,accuracy_config.ground_truth=answer,accuracy_config.extractor=boxed_math_extractor \ --mode both ``` @@ -244,8 +240,9 @@ inference-endpoint init submission # 3. Run (YAML mode) inference-endpoint benchmark from-config \ - --config submission_template.yaml \ - --report-dir official_results + --config submission_template.yaml +# Note: from-config only accepts --config, --timeout, and --mode via CLI. +# Set report_dir in the YAML if you need a specific output location. ``` ### Validate First diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 0c21b705..2fe0e9c2 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -2,7 +2,7 @@ This guide provides everything you need to contribute to the MLPerf Inference Endpoint Benchmarking System. -## 🚀 Getting Started +## Getting Started ### Prerequisites @@ -14,40 +14,48 @@ This guide provides everything you need to contribute to the MLPerf Inference En ### Development Environment Setup ```bash -# 1. Clone the repository -git clone https://github.com/mlperf/inference-endpoint.git -cd inference-endpoint +# 1. Fork https://github.com/mlcommons/endpoints on GitHub, then clone your fork +git clone https://github.com/YOUR_USERNAME/endpoints.git +cd endpoints -# 2. Create virtual environment (Python 3.12+ required) +# 2. Add the upstream repo as a remote +git remote add upstream https://github.com/mlcommons/endpoints.git + +# 3. Create virtual environment (Python 3.12+ required) python3.12 -m venv venv source venv/bin/activate # On Windows: venv\Scripts\activate -# 3. Install development dependencies +# 4. Install development dependencies pip install -e ".[dev,test]" -# 4. Install pre-commit hooks +# 5. Install pre-commit hooks pre-commit install -# 5. Verify installation +# 6. 
Verify installation inference-endpoint --version pytest --version ``` -## 🏗️ Project Structure +## Project Structure ``` -inference-endpoint/ +endpoints/ ├── src/inference_endpoint/ # Main package source -│ ├── cli.py # Command-line interface +│ ├── main.py # Entry point and CLI app +│ ├── exceptions.py # Project-wide exception types +│ ├── async_utils/ # Event loop, ZMQ transport, pub/sub │ ├── commands/ # CLI command implementations │ ├── config/ # Configuration and schema management │ ├── core/ # Core types and orchestration │ ├── dataset_manager/ # Dataset handling and loading │ ├── endpoint_client/ # HTTP/ZMQ endpoint communication +│ ├── evaluation/ # Accuracy evaluation and scoring │ ├── load_generator/ # Load generation and scheduling │ ├── metrics/ # Performance measurement and reporting │ ├── openai/ # OpenAI API compatibility +│ ├── plugins/ # Plugin system │ ├── profiling/ # Performance profiling tools +│ ├── sglang/ # SGLang API adapter │ ├── testing/ # Test utilities (echo server, etc.) │ └── utils/ # Common utilities ├── tests/ # Test suite @@ -60,7 +68,7 @@ inference-endpoint/ └── scripts/ # Utility scripts ``` -## 🧪 Testing +## Testing ### Running Tests @@ -103,12 +111,14 @@ import pytest from inference_endpoint.core.types import Query class TestQuery: + @pytest.mark.unit def test_query_creation(self): """Test creating a basic query.""" - query = Query(prompt="Test", model="test-model") - assert query.prompt == "Test" - assert query.model == "test-model" + query = Query(data={"prompt": "Test", "model": "test-model"}) + assert query.data["prompt"] == "Test" + assert query.data["model"] == "test-model" + @pytest.mark.unit @pytest.mark.asyncio(mode="strict") async def test_async_operation(self): """Test async operations.""" @@ -116,11 +126,21 @@ class TestQuery: pass ``` -## 📝 Code Quality +## Code Quality ### Pre-commit Hooks -The project uses pre-commit hooks to ensure code quality: +The project uses pre-commit hooks to ensure code quality. + +Hooks that run automatically on commit: + +- trailing-whitespace, end-of-file-fixer, check-yaml, check-merge-conflict, debug-statements +- `ruff` (lint + autofix) and `ruff-format` +- `mypy` type checking +- `prettier` for YAML/JSON/Markdown +- License header enforcement (Apache 2.0 SPDX header required on all Python files, added by `scripts/add_license_header.py`) + +**Always run `pre-commit run --all-files` before committing.** ```bash # Install hooks (done during setup) @@ -131,13 +151,12 @@ pre-commit run # Run all hooks on all files pre-commit run --all-files - -# Skip hooks (use sparingly) -git commit --no-verify ``` ### Code Formatting +Configuration: `ruff` (line-length 88, target Python 3.12), `ruff-format` (double quotes, space indent). + ```bash # Format code with ruff ruff format src/ tests/ @@ -159,12 +178,17 @@ mypy src/ pre-commit run --all-files ``` -## 🔧 Development Workflow +## Development Workflow ### 1. Feature Development ```bash -# Create feature branch +# Sync your fork with upstream before starting +git fetch upstream +git checkout main +git merge upstream/main + +# Create a feature branch on your fork git checkout -b feature/your-feature-name # Make changes and test @@ -175,7 +199,7 @@ pre-commit run --all-files git add . 
git commit -m "feat: add your feature description" -# Push and create PR +# Push to your fork and open a PR against mlcommons/endpoints git push origin feature/your-feature-name ``` @@ -197,42 +221,15 @@ When developing a new component: - **Performance Tests**: Ensure no performance regressions - **Documentation**: Update docs for new features -## 📚 Documentation +## Documentation ### Writing Documentation -- **Code Comments**: Use docstrings for all public APIs +- **Code Comments**: Add comments only where the _why_ is not obvious from the code; avoid restating what the code does - **README Updates**: Update README.md for user-facing changes -- **API Documentation**: Document new interfaces and changes - **Examples**: Provide usage examples for new features -### Documentation Standards - -```python -def process_query(query: Query) -> QueryResult: - """ - Process a query and return the result. - - Args: - query: The query to process - - Returns: - QueryResult containing the processed response - - Raises: - QueryError: If the query cannot be processed - - Example: - >>> query = Query(prompt="Hello") - >>> result = process_query(query) - >>> print(result.content) - 'Hello there!' - """ - # Implementation here - pass -``` - -## 🚀 Performance Considerations +## Performance Considerations ### Development Guidelines @@ -254,7 +251,7 @@ pytest --benchmark-only pytest --benchmark-compare ``` -## 🔍 Debugging +## Debugging ### Common Issues @@ -276,7 +273,7 @@ pytest -s -v python -m pdb -m pytest test_file.py ``` -## 📦 Package Management +## Package Management ### Adding Dependencies @@ -291,7 +288,7 @@ Install after updating: pip install -e ".[dev,test]" ``` -## 🚨 Troubleshooting +## Troubleshooting ### Common Problems @@ -326,17 +323,20 @@ python -c "import sys; print(sys.path)" export PYTHONPATH="${PYTHONPATH}:$(pwd)/src" ``` -## 🤝 Contributing Guidelines +## Contributing Guidelines ### Pull Request Process -1. **Fork the repository** and create a feature branch -2. **Make your changes** following the coding standards -3. **Add tests** for new functionality -4. **Update documentation** as needed -5. **Run all checks** locally before submitting -6. **Create a PR** with clear description and tests -7. **Address review comments** promptly +1. **Fork** `mlcommons/endpoints` on GitHub +2. **Clone your fork** and add `upstream` as a remote (see [Development Environment Setup](#development-environment-setup)) +3. **Sync with upstream** (`git fetch upstream && git merge upstream/main`) before starting work +4. **Create a feature branch** on your fork (`git checkout -b feature/your-feature-name`) +5. **Make your changes** following the coding standards +6. **Add tests** for new functionality +7. **Update documentation** as needed +8. **Run all checks** locally: `pytest` and `pre-commit run --all-files` +9. **Push to your fork** and open a PR against `mlcommons/endpoints:main` +10. **Address review comments** promptly ### Commit Message Format @@ -351,6 +351,8 @@ docs(readme): update installation instructions test(loadgen): add performance benchmarks ``` +Allowed types: `feat`, `fix`, `docs`, `test`, `chore`, `refactor`, `perf`, `ci`. 
+ ### Code Review Checklist - [ ] Code follows style guidelines @@ -360,20 +362,9 @@ test(loadgen): add performance benchmarks - [ ] Security implications are reviewed - [ ] Error handling is appropriate -## 📞 Getting Help +## Getting Help -- **Issues**: [GitHub Issues](https://github.com/mlperf/inference-endpoint/issues) -- **Discussions**: [GitHub Discussions](https://github.com/mlperf/inference-endpoint/discussions) +- **Issues**: [GitHub Issues](https://github.com/mlcommons/endpoints/issues) +- **Discussions**: [GitHub Discussions](https://github.com/mlcommons/endpoints/discussions) - **Documentation**: Check this guide and project docs - **Team**: Reach out to the development team - -## 🎯 Next Steps - -1. **Set up your environment** using this guide -2. **Explore the codebase** to understand the architecture -3. **Pick a component** to work on from the project board -4. **Start with tests** to understand the expected behavior -5. **Implement incrementally** with regular testing -6. **Ask questions** when you need help - -Happy coding! 🚀 diff --git a/docs/ENDPOINT_CLIENT.md b/docs/ENDPOINT_CLIENT.md index 5d604f7d..bbb4272d 100644 --- a/docs/ENDPOINT_CLIENT.md +++ b/docs/ENDPOINT_CLIENT.md @@ -1,4 +1,13 @@ -# MLPerf Inference Endpoints: HttpClient Design Document +# Endpoint Client Implementation Deep Dive + +> Primary component spec: [docs/endpoint_client/DESIGN.md](endpoint_client/DESIGN.md) +> +> This document is the detailed companion reference for the endpoint client implementation. Use +> it for deeper material on connection pool architecture, worker internals, SSE handling, and +> performance analysis. Treat `docs/endpoint_client/DESIGN.md` as the canonical high-level design +> spec. + +Detailed design for the `HTTPEndpointClient`: functional requirements, performance constraints, connection pool architecture, and worker process integration. ## Table of Contents @@ -231,9 +240,9 @@ class QueryStatus(Enum): **Classes:** -| Class | Source | Description | -| ------------------ | ----------- | -------------------------------------------------------- | -| `HTTPClientConfig` | `config.py` | `@dataclass`: client, worker pool, and connection config | +| Class | Source | Description | +| ------------------ | ----------- | ------------------------------------------------------- | +| `HTTPClientConfig` | `config.py` | `BaseModel`: client, worker pool, and connection config | ```python class APIType(str, Enum): diff --git a/docs/GITHUB_SETUP.md b/docs/GITHUB_SETUP.md index 30719bce..4a6d1070 100644 --- a/docs/GITHUB_SETUP.md +++ b/docs/GITHUB_SETUP.md @@ -18,6 +18,10 @@ Automatically requests code reviews on new PRs. Runs pytest test suite and generates coverage reports. +### CLA Workflow (`.github/workflows/cla.yml`) + +Checks contributor license agreement requirements. + ### PR Template (`.github/pull_request_template.md`) Standardized PR description template. @@ -30,7 +34,7 @@ Standardized PR description template. - Branch pattern: `main` - ✓ Require PR before merging (1 approval) -- ✓ Require status checks: pre-commit, test, branch-validator +- ✓ Require status checks: Pre-commit, Tests, cla-bot - ✓ Require conversation resolution - ✓ Auto-delete head branches @@ -73,10 +77,6 @@ Edit `.github/workflows/auto-review.yml`: const defaultReviewers = ["username1", "username2"]; ``` -### Modify Branch Patterns - -Edit `.github/workflows/branch-validator.yml` to add/remove allowed patterns. - ### Update PR Template Edit `.github/pull_request_template.md` for project-specific requirements. 
diff --git a/docs/LOCAL_TESTING.md b/docs/LOCAL_TESTING.md index aeb866b8..0abf90fa 100644 --- a/docs/LOCAL_TESTING.md +++ b/docs/LOCAL_TESTING.md @@ -1,5 +1,7 @@ # Local Testing Guide +How to run and test the CLI locally using the built-in echo server and the included dummy dataset, without a real inference endpoint. + ## Quick Start: Testing CLI with Echo Server ### 1. Prepare Test Environment @@ -13,10 +15,10 @@ The echo server is included for local testing and mirrors requests back as respo ```bash # Terminal 1: Start echo server on port 8765 -python -m inference_endpoint.testing.echo_server --port 8765 +python3 -m inference_endpoint.testing.echo_server --port 8765 # Or use default port 12345 -python -m inference_endpoint.testing.echo_server +python3 -m inference_endpoint.testing.echo_server ``` The server will log: @@ -72,7 +74,8 @@ Waiting for 5 responses... inference-endpoint -v benchmark offline \ --endpoints http://localhost:8765 \ --model Qwen/Qwen3-8B \ - --dataset tests/datasets/dummy_1k.jsonl + --dataset tests/datasets/dummy_1k.jsonl \ + --duration 0 # Production test with custom params and report generation inference-endpoint -v benchmark offline \ @@ -112,6 +115,7 @@ inference-endpoint -v benchmark online \ --endpoints http://localhost:8765 \ --model Qwen/Qwen3-8B \ --dataset tests/datasets/dummy_1k.jsonl \ + --duration 0 \ --load-pattern poisson \ --target-qps 100 \ --report-dir online_benchmark_report @@ -141,7 +145,7 @@ Cleaning up... inference-endpoint -v info # Generate template -inference-endpoint init --template offline +inference-endpoint init offline # Validate config inference-endpoint validate-yaml --config offline_template.yaml @@ -156,7 +160,9 @@ inference-endpoint benchmark offline \ ### 6. View Results -When run with `--report-dir`, a directory is created containing benchmark metrics files (JSON/CSV) with detailed QPS, latency, TTFT, and TPOT data. +A report directory is always created (at `--report-dir` if specified, or at a default path +otherwise), containing benchmark artifacts: `result_summary.json`, `runtime_settings.json`, +`sample_idx_map.json`, `report.txt`, and `events.jsonl`. ### 7. Stop the Echo Server @@ -170,10 +176,10 @@ pkill -f echo_server ```bash # Custom host and port -python -m inference_endpoint.testing.echo_server --host 0.0.0.0 --port 9000 +python3 -m inference_endpoint.testing.echo_server --host 0.0.0.0 --port 9000 # Check help -python -m inference_endpoint.testing.echo_server --help +python3 -m inference_endpoint.testing.echo_server --help ``` ## Request Format @@ -186,7 +192,7 @@ The echo server expects OpenAI-compatible format but simplifies it: { "prompt": "Your query text", "model": "model-name", - "max_tokens": 50, + "max_completion_tokens": 50, "stream": false } ``` @@ -206,7 +212,7 @@ Error: Connection failed ### Validation Errors ``` -Error: prompt not found in json_value +Error: prompt not found in query.data ``` **Solution:** Use `"prompt"` format in Query data, not `"messages"` (client converts it) @@ -225,7 +231,7 @@ Error: Timeout (>60s) ```bash # 1. Start echo server -python -m inference_endpoint.testing.echo_server --port 8000 & +python3 -m inference_endpoint.testing.echo_server --port 8000 & # 2. 
Generate fresh dataset if needed python scripts/create_dummy_dataset.py @@ -302,8 +308,9 @@ inference-endpoint benchmark online \ **Sample Count Control:** -- Sample priority: `--num-samples` > dataset size (duration=0) > calculated (target_qps × duration) -- Default duration: 0 (runs until dataset exhausted or max_duration reached) +- Use `--duration 0` when you want a local test to stop after exhausting the dataset instead of running for the default timed duration +- Sample priority: `--num-samples` > dataset size (when `--duration 0`) > calculated (target_qps × duration) +- Default duration: 600000ms (10 minutes) **Testing & Debugging:** diff --git a/docs/async_utils/DESIGN.md b/docs/async_utils/DESIGN.md new file mode 100644 index 00000000..8b1da236 --- /dev/null +++ b/docs/async_utils/DESIGN.md @@ -0,0 +1,134 @@ +# Async Utils — Design Spec + +> Async infrastructure shared across the system: uvloop event loop lifecycle management, ZMQ-based IPC transport between processes, and a pub/sub event bus for real-time metric streaming. + +**Component specs:** **async_utils** · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`async_utils/` provides the async infrastructure shared across the system: event loop lifecycle +management, ZMQ-based IPC transport, event pub/sub, and background services. All other components +depend on this package for their async primitives. + +## Responsibilities + +- Create and manage uvloop event loops with `eager_task_factory` +- Provide ZMQ IPC transport between the main process and worker processes +- Provide a pub/sub event bus for real-time metric streaming +- Host background services (event logger, metrics aggregator) as independent processes + +## Component Map + +``` +async_utils/ +├── loop_manager.py ← creates/tracks named uvloop event loops +├── event_publisher.py ← ZMQ-backed pub/sub for event records +├── runner.py ← async runner utilities +├── transport/ ← ZMQ IPC between processes +│ ├── protocol.py ← message framing definitions +│ └── zmq/ +│ ├── context.py ← managed ZMQ context lifecycle +│ ├── pubsub.py ← PUB/SUB socket pair +│ └── transport.py ← PUSH/PULL worker pool transport +└── services/ + ├── event_logger/ ← writes events to JSONL or SQLite (see DESIGN.md) + └── metrics_aggregator/ ← real-time metric computation (see DESIGN.md) +``` + +Sub-service specs: + +- [Event Logger](services/event_logger/DESIGN.md) +- [Metrics Aggregator](services/metrics_aggregator/DESIGN.md) + +## Public Interface + +### `LoopManager` + +Singleton via `SingletonMixin` — `LoopManager()` always returns the same instance. All event +loops in the process are created and tracked here. 
+ +```python +class LoopManager(SingletonMixin): + def create_loop( + self, + name: str, + backend: Literal["uvloop", "asyncio"] = "uvloop", + task_factory_mode: Literal["eager", "lazy"] = "eager", + ) -> asyncio.AbstractEventLoop + + @property + def default_loop(self) -> asyncio.AbstractEventLoop + # The loop running on the main thread +``` + +The `task_factory_mode="eager"` setting installs Python 3.12's `eager_task_factory`, which runs +new coroutines synchronously until their first `await`. This eliminates a scheduling round-trip for +short-lived coroutines on the hot path. + +### `EventPublisherService` + +Singleton via `SingletonMixin` — after the first construction, subsequent calls return the +cached instance. The first construction requires a `ManagedZMQContext`. Subscribers receive +`EventRecord` messages over a ZMQ SUB socket. + +```python +class EventPublisherService(SingletonMixin, ZmqEventRecordPublisher): + def __init__( + self, + managed_zmq_context: ManagedZMQContext, + extra_eager: bool = False, + isolated_event_loop: bool = False, + ) -> None + + def publish(self, record: EventRecord) -> None +``` + +### ZMQ Transport + +The transport layer is not called directly by application code. `HTTPEndpointClient` and +`WorkerManager` construct `WorkerPoolTransport` via the factory in `transport/zmq/transport.py`. + +```python +# Protocol (async_utils/transport/protocol.py) +class WorkerPoolTransport(Protocol): + def send(self, worker_id: int, query: Query) -> None + def poll(self) -> QueryResult | StreamChunk | None + async def recv(self) -> QueryResult | StreamChunk | None +``` + +## Design Decisions + +**uvloop everywhere** + +uvloop replaces the default asyncio event loop with a libuv-backed implementation that reduces +per-event overhead. All event loops in the system — main process and workers — use uvloop unless +explicitly overridden for tests. + +**`eager_task_factory` for minimal await overhead** + +Python 3.12 introduced `eager_task_factory`, which runs a coroutine synchronously until its first +suspension point before scheduling it. On the hot path, many coroutines (e.g. `recv()` from an +already-full buffer) complete without ever suspending, eliminating a full scheduler round-trip. + +**ZMQ PUSH/PULL for worker IPC** + +PUSH/PULL sockets provide load-balanced, message-framed IPC without any acknowledgement overhead. +Messages are framed at the ZMQ layer, so the application never needs to handle partial reads or +message boundaries. The alternative (TCP + asyncio streams) requires manual framing and is slower +for small messages. + +**Inproc vs IPC socket selection** + +Workers on the same machine use ZMQ transports backed by a managed context. Depending on how that +context is created, the implementation may use `ipc://` or other ZMQ transport details internally. +Callers do not select this directly; `zmq/context.py` encapsulates it. 
+ +## Integration Points + +| Consumer | Usage | +| -------------------------------- | --------------------------------------------------- | +| `endpoint_client/http_client.py` | Uses `WorkerPoolTransport` for worker communication | +| `endpoint_client/worker.py` | Runs its own uvloop via `LoopManager` | +| `async_utils/services/` | Background service processes subscribe via ZMQ SUB | diff --git a/docs/async_utils/services/design.md b/docs/async_utils/services/DESIGN.md similarity index 89% rename from docs/async_utils/services/design.md rename to docs/async_utils/services/DESIGN.md index f3736611..a26f1378 100644 --- a/docs/async_utils/services/design.md +++ b/docs/async_utils/services/DESIGN.md @@ -13,22 +13,20 @@ This document describes the design of the pub-sub system for **EventRecords**: h | **Publisher** | Main process (benchmark/loadgen) | Holds `EventPublisherService` (singleton). Binds a ZMQ PUB socket to an IPC (or TCP) address. Publishes `EventRecord` instances as events occur (e.g. sample issued, first token, complete). | | **Subscribers** | Same process and/or separate processes | Connect to the publisher's address via ZMQ SUB sockets. Each runs its own event loop (if async). Filter by topic and process batches of decoded `EventRecord`s. | -The publisher is created inside a **scoped** `ManagedZMQContext` (e.g. in the main process). The publisher binds via `ctx.bind(socket, path)` which constructs an IPC address from the context's `socket_dir` and the given path. Subscribers connect using the same `socket_dir` and path via `ctx.connect(socket, path)`. +The publisher is created inside a **scoped** `ManagedZMQContext` (e.g. in the main process). It binds a ZMQ PUB socket using a **socket name** within the context's socket directory (e.g. `ev_pub_`), producing an IPC path like `ipc:///path/to/socket_dir/ev_pub_`. The socket directory and socket name must be passed to any subscriber so it can connect. ### 1.2 In-process vs out-of-process subscribers -- **In-process**: Subscriber runs in the same process as the publisher but on a **different event loop** (e.g. `LoopManager().create_loop("subscriber_name")`). It uses the same `ManagedZMQContext` (same `socket_dir`) and connects using the publisher's `bind_path`. Example: `ConsoleSubscriber`, `DurationSubscriber` in the demo. -- **Out-of-process**: Subscriber runs in a **separate process** (e.g. `subprocess.Popen`). That process creates its own `ManagedZMQContext` with the publisher's **socket directory** via `--socket-dir`. It is also passed the socket name via `--socket-name` and connects using `ctx.connect(socket, socket_name)`. Example: **event_logger** and **metrics_aggregator** services launched from the demo/benchmark. +- **In-process**: Subscriber runs in the same process as the publisher but on a **different event loop** (e.g. `LoopManager().create_loop("subscriber_name")`). It uses the same `ManagedZMQContext` (same `socket_dir`) and connects via `ctx.connect(socket, publisher.bind_path)`. Example: `ConsoleSubscriber`, `DurationSubscriber` in the demo. +- **Out-of-process**: Subscriber runs in a **separate process** (e.g. `subprocess.Popen`). That process creates its own `ManagedZMQContext` with a **shared socket directory** (e.g. `socket_dir=log_dir.parent` so the IPC path exists and is writable). It is passed the publisher's socket directory and socket name via CLI (`--socket-dir ` and `--socket-name `) and connects via `ctx.connect(socket, socket_name)`. 
Example: **event_logger** and **metrics_aggregator** services launched from the demo/benchmark. So: one publisher process; zero or more subscribers in the same process (each with its own loop) and/or in child processes (each with its own context and loop). All subscribers share the same logical stream of events (subject to topic filters and to de-sync; see §3). ### 1.3 ZMQ context and socket directory -- **ManagedZMQContext** is a per-process singleton. It owns the ZMQ context and an optional **socket directory** used for IPC paths. -- All socket binding and connecting goes through `ctx.bind(socket, path)` and `ctx.connect(socket, path)`. These methods construct the full IPC address (`ipc:///`) using `urllib.parse.urlunparse`. -- On `bind()`: if `socket_dir` is `None`, a temporary directory is created automatically. If it's a string, the directory is created via `mkdir(parents=True, exist_ok=True)`. -- On `connect()`: `socket_dir` must already be set (either via `__init__` or a prior `bind()`). This ensures subscribers point to the publisher's actual socket directory. -- Subscribers in the same process use the same context and `socket_dir`. Subscribers in another process receive the `socket_dir` and socket name via CLI args (`--socket-dir`, `--socket-name`). +- **ManagedZMQContext** is a per-process singleton. It owns the ZMQ context and a **socket directory** used for IPC paths. +- The publisher **binds** in that directory (e.g. `ipc://{socket_dir}/ev_pub_{uuid}`). +- Subscribers in the same process use the same context and connect via `ctx.connect(socket, publisher.bind_path)`. Subscribers in another process must be given the same **socket directory** and the **socket name** (the relative name used when binding), so the parent passes `--socket-dir` and `--socket-name` to the child subprocess. - Cleanup: when the scoped context in the publisher process exits, it closes sockets and terminates the context; the publisher also unlinks the IPC file. Subscriber processes must connect before the publisher exits and should shut down when they see `session.ended` (or similar) to avoid using a closed socket. ### 1.4 Process architecture diagram @@ -303,14 +301,14 @@ stateDiagram-v2 - **Role**: Subscribes to all (or a configured set of) topics and **persists** event records. - **Outputs**: JSONL and/or SQL (SQLAlchemy; default sqlite). Writers are pluggable (`RecordWriter`); each record is written to all configured writers. -- **Lifecycle**: On **session.ended**, the ENDED record is written, all subsequent events in the batch are dropped, and the service flushes and closes writers and signals shutdown (e.g. sets an event so the process can exit). No events of any kind are accepted after **session.ended**. -- **Process**: Typically run as a **subprocess**; given `--log-dir`, `--socket-dir`, `--socket-name` (and optionally `--writers jsonl sql`). Creates `ManagedZMQContext.scoped(socket_dir=...)` with the publisher's socket directory and connects via `ctx.connect(socket, socket_name)`. +- **Lifecycle**: On **session.ended**, it writes the ENDED record, then drops all subsequent events (including error events) in any later batch, flushes and closes writers, and signals shutdown. Note: the current implementation does not make an exception for error events — all records after ENDED are discarded. +- **Process**: Typically run as a **subprocess**; given `--log-dir`, `--socket-dir`, `--socket-name`, and optionally `--writers jsonl sql`. 
Uses the same socket directory as the publisher so the IPC path is valid. ### 6.2 Metrics aggregator - **Role**: Subscribes to EventRecords and derives real-time metrics (e.g. TTFT, sample latency, token counts). May use a tokenizer pool for token-based metrics. Shuts down on **session.ended**. - **Outputs**: Real-time metrics will eventually be pushed to Prometheus via PushGateway. For now, logging and writing a final report to JSON (the legacy behavior) is sufficient. -- **Process**: Run as a **subprocess**; given `--metrics-dir`, `--socket-dir`, `--socket-name`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC socket via `ctx.connect(socket, socket_name)`. +- **Process**: Run as a **subprocess**; given `--metrics-dir`, `--socket-dir`, `--socket-name`, and optional tokenizer options. Uses a dedicated event loop and `ManagedZMQContext.scoped(socket_dir=...)` so it can connect to the publisher's IPC address. --- diff --git a/docs/async_utils/services/event_logger/design.md b/docs/async_utils/services/event_logger/DESIGN.md similarity index 97% rename from docs/async_utils/services/event_logger/design.md rename to docs/async_utils/services/event_logger/DESIGN.md index 8da4a791..b50d3d69 100644 --- a/docs/async_utils/services/event_logger/design.md +++ b/docs/async_utils/services/event_logger/DESIGN.md @@ -1,5 +1,7 @@ # Event Logger Service — Design Document +> ZMQ subscriber service that consumes `EventRecord` messages from the pub/sub bus and persists them to JSONL or SQLite storage backends; runs as an independent subprocess. + ## Overview The event logger is a ZMQ subscriber service that consumes `EventRecord` messages diff --git a/docs/async_utils/services/metrics_aggregator/DESIGN.md b/docs/async_utils/services/metrics_aggregator/DESIGN.md new file mode 100644 index 00000000..7ff6d0d3 --- /dev/null +++ b/docs/async_utils/services/metrics_aggregator/DESIGN.md @@ -0,0 +1,389 @@ +# Metrics Aggregator Service — Design Document + +## Overview + +The metrics aggregator is a ZMQ subscriber service that consumes `EventRecord` messages +from the pub/sub event bus, computes per-sample metrics in real time, and pushes them +to a `MetricEmitter` backend (currently JSONL; future: Prometheus PushGateway). + +It runs as an independent subprocess with its own event loop, connected to the same +ZMQ PUB socket as the EventLoggerService.
+ +``` + ZMQ PUB (ipc://) + │ + ┌──────────────┼──────────────┐ + ▼ ▼ ▼ + EventLogger MetricsAggregator (future subscribers) + (JSONL/SQL) (real-time metrics) +``` + +## Module Layout + +``` +metrics_aggregator/ +├── __init__.py +├── __main__.py # CLI entry point +├── aggregator.py # MetricsAggregatorService (ZmqEventRecordSubscriber) +├── emitter.py # MetricEmitter ABC, JsonlMetricEmitter +├── metrics_table.py # SampleRow, MetricsTable +└── token_metrics.py # TokenizePool (thread-pool tokenizer) +``` + +## Subscribed Events + +### Session Events + +| Event | Effect | +| --------------------------------------------- | ---------------------------------------------------- | +| `SessionEventType.STARTED` | Records session start timestamp | +| `SessionEventType.START_PERFORMANCE_TRACKING` | Sets `is_tracking = True` | +| `SessionEventType.STOP_PERFORMANCE_TRACKING` | Sets `is_tracking = False` | +| `SessionEventType.ENDED` | Flushes emitter, closes subscriber, signals shutdown | + +### Sample Events + +| Event | Stored Field | Metric Emitted | Formula | +| ------------------ | --------------------------------------------------- | ------------------------------------- | -------------------------------------------------------- | +| `ISSUED` | `issued_ns` | `isl` | `len(token_ids)` or `token_count(text)` via `PromptData` | +| `RECV_FIRST` | `recv_first_ns`, `last_recv_ns`, `first_chunk_text` | `ttft_ns` | `recv_first_ns - issued_ns` | +| `RECV_NON_FIRST` | `last_recv_ns` (updated) | `chunk_delta_ns` | `timestamp - last_recv_ns` | +| `CLIENT_SEND` | `client_send_ns` | — | — | +| `CLIENT_RESP_DONE` | `client_resp_done_ns` | `request_duration_ns` | `client_resp_done_ns - client_send_ns` | +| `COMPLETE` | `complete_ns` | `sample_latency_ns`, `osl`, `tpot_ns` | see below | + +Ignored sample events: `TRANSPORT_SENT`, `TRANSPORT_RECV` (infrastructure-level, not +relevant for user-facing metrics). + +## Performance Tracking Window + +The `is_tracking` flag gates which samples are tracked: + +``` + STARTED ENDED + │ │ + ▼ ▼ +────┬─────────┬─────────────────────────────┬──────────────────┬── + │ │ ◄── samples issued here │ │ + │ START_PERF_TRACKING STOP_PERF_TRACKING │ + │ │ are tracked │ │ + │ │ │ │ +``` + +- A sample is tracked **if and only if** its `ISSUED` event arrives while `is_tracking` is `True`. +- Once tracked, a sample continues to receive events and emit metrics regardless of + later `STOP_PERFORMANCE_TRACKING` events. Only new ISSUEs are blocked. +- This allows warmup queries (before START) and cooldown queries (after STOP) to be + excluded from reported metrics while still draining in-flight samples cleanly. 
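+
+A condensed sketch of this gating rule (hypothetical names, not the service's actual
+dispatch code — the real handlers operate on `EventRecord`s and a `MetricsTable`):
+
+```python
+is_tracking = False
+rows: dict[str, dict] = {}
+
+
+def handle(event_type: str, sample_uuid: str = "") -> None:
+    global is_tracking
+    if event_type == "START_PERFORMANCE_TRACKING":
+        is_tracking = True
+    elif event_type == "STOP_PERFORMANCE_TRACKING":
+        is_tracking = False  # blocks new ISSUEDs only; existing rows survive
+    elif event_type == "ISSUED":
+        if is_tracking:  # the only place the gate is consulted
+            rows[sample_uuid] = {}
+    elif sample_uuid in rows:  # tracked samples keep receiving events
+        rows[sample_uuid][event_type] = True
+```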
+ +## Data Model: SampleRow + +Each tracked sample gets a `SampleRow` — a `msgspec.Struct` with `gc=False` that +stores raw `int | None` nanosecond timestamps and accumulated text: + +``` +SampleRow +├── sample_uuid: str +├── issued_ns: int | None ← set on ISSUED +├── complete_ns: int | None ← set on COMPLETE +├── recv_first_ns: int | None ← set on RECV_FIRST +├── last_recv_ns: int | None ← set on RECV_FIRST, updated on each RECV_NON_FIRST +├── client_send_ns: int | None ← set on CLIENT_SEND +├── client_resp_done_ns: int | None ← set on CLIENT_RESP_DONE +├── prompt_text: str | None ← from ISSUED event data (for ISL tokenization) +├── first_chunk_text: str | None ← from RECV_FIRST event data (for TPOT denominator) +├── first_chunk_tokens: int | None ← token count of first_chunk_text, resolved after async tokenization +└── output_chunks: list[str] ← accumulated from RECV_FIRST/RECV_NON_FIRST data +``` + +Metric formulas are simple methods on the row: + +```python +def ttft_ns(self) -> int | None: # recv_first_ns - issued_ns +def sample_latency_ns(self) -> int | None: # complete_ns - issued_ns +def request_duration_ns(self) -> int | None: # client_resp_done_ns - client_send_ns +def output_text(self) -> str: # "".join(output_chunks) +``` + +Rows are created on ISSUED and removed on COMPLETE. + +### Design Rationale: Why Not a Declarative Field System + +An earlier iteration used `_MetricField` structs with `delta_start_field_prio` lists +to declaratively describe which field pairs produce which metrics. This was abandoned +because: + +1. The formulas are few and fixed — a declarative DSL adds indirection without flexibility. +2. String-based field lookups at runtime obscure the actual data flow. +3. The metric emission logic was coupled into the data storage layer (`set_field` both + stored a timestamp and emitted a metric), making it hard to test or reason about. +4. Special cases (`mutable` flag for `recv_non_first`, `msgspec.UNSET` sentinels) + added complexity for what is fundamentally `int | None`. + +The current design keeps data storage (SampleRow) separate from metric emission +(aggregator event handlers). Each handler is 5-15 lines, reads top-to-bottom, and +is independently testable. + +## Metrics Computed + +### Timing Metrics (emitted immediately on triggering event) + +| Metric | Emitted On | Formula | Notes | +| --------------------- | ---------------- | -------------------------------------- | ---------------------------------------------------------------------------------------------------------- | +| `ttft_ns` | RECV_FIRST | `recv_first_ns - issued_ns` | Time to first token. Streaming only. | +| `sample_latency_ns` | COMPLETE | `complete_ns - issued_ns` | End-to-end latency from issue to completion. | +| `request_duration_ns` | CLIENT_RESP_DONE | `client_resp_done_ns - client_send_ns` | HTTP-level request time (inside worker process). | +| `chunk_delta_ns` | RECV_NON_FIRST | `timestamp - last_recv_ns` | Inter-token arrival time. `last_recv_ns` starts at `recv_first_ns` and advances with each non-first chunk. 
| + +### Token Metrics (require tokenization, may be async) + +| Metric | Emitted On | Formula | Notes | +| --------- | -------------------- | ------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `isl` | ISSUED | `len(token_ids)` or `token_count(text)` | Input sequence length. ISSUED event carries `PromptData` with either `token_ids` (SGLang, emitted synchronously) or `text` (OpenAI, tokenized async). | +| `osl` | COMPLETE (awaited) | `token_count(output_text)` | Output sequence length. Output text is from accumulated chunks (streaming) or COMPLETE data (non-streaming). | +| `tpot_ns` | COMPLETE (after OSL) | `(complete_ns - recv_first_ns) / (osl - first_chunk_tokens)` | Time per output token after the first chunk. The first chunk may contain multiple tokens, so `first_chunk_text` is tokenized separately for the denominator. Only emitted for streaming responses where `osl - first_chunk_tokens > 0`. | + +## Event Dispatch Flow + +``` +process(records: list[EventRecord]) +│ +├── for each record: +│ ├── Session events → update is_tracking / session state +│ │ +│ └── Sample events (if sample_uuid non-empty): +│ ├── ISSUED +│ │ ├── if not is_tracking: skip +│ │ ├── create SampleRow in MetricsTable +│ │ ├── store issued_ns +│ │ ├── store prompt_text from record.data (if str) +│ │ └── schedule ISL tokenization (async, fire-and-forget) +│ │ +│ ├── RECV_FIRST +│ │ ├── lookup row (skip if not tracked) +│ │ ├── store recv_first_ns, last_recv_ns +│ │ ├── emit ttft_ns +│ │ └── append record.data to output_chunks +│ │ +│ ├── RECV_NON_FIRST +│ │ ├── lookup row (skip if not tracked) +│ │ ├── emit chunk_delta_ns (from last_recv_ns) +│ │ ├── update last_recv_ns +│ │ └── append record.data to output_chunks +│ │ +│ ├── CLIENT_SEND +│ │ └── store client_send_ns +│ │ +│ ├── CLIENT_RESP_DONE +│ │ ├── store client_resp_done_ns +│ │ └── emit request_duration_ns +│ │ +│ └── COMPLETE +│ ├── store complete_ns +│ ├── emit sample_latency_ns +│ ├── await OSL tokenization → emit osl +│ ├── if streaming and osl > first_chunk_tokens → emit tpot_ns +│ └── remove row from MetricsTable +│ +└── if ENDED seen: flush emitter, close subscriber, signal shutdown +``` + +## MetricEmitter + +The `MetricEmitter` ABC defines: + +```python +class MetricEmitter(ABC): + def emit(self, sample_uuid: str, metric_name: str, value: int | float) -> None: ... + def flush(self) -> None: ... + def close(self) -> None: ... +``` + +### JsonlMetricEmitter (current implementation) + +Writes one JSON line per metric: + +```json +{"sample_uuid":"a1b2c3...","metric_name":"ttft_ns","value":1500,"timestamp_ns":98765432100} +{"sample_uuid":"a1b2c3...","metric_name":"sample_latency_ns","value":4000,"timestamp_ns":98765436100} +``` + +Uses `msgspec.json.Encoder` for serialization. Supports a configurable `flush_interval` +(flush to disk every N records). `timestamp_ns` is the wall-clock time when the metric +was emitted (not the event timestamp). + +### Future: PrometheusEmitter + +Would push to Prometheus PushGateway. The `emit()` signature supports this — +`metric_name` maps to a Prometheus metric, `sample_uuid` becomes a label, +`value` is the observation. Histograms/summaries can be built by accumulating +values per metric name. 
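+
+For illustration, a minimal emitter matching the JSONL format above might look like the
+following (a sketch, not the project's `JsonlMetricEmitter`; the constructor arguments
+and buffering policy are assumptions):
+
+```python
+import time
+
+import msgspec
+
+
+class SketchJsonlEmitter:
+    def __init__(self, path: str, flush_interval: int = 100) -> None:
+        self._file = open(path, "ab")
+        self._encoder = msgspec.json.Encoder()
+        self._flush_interval = flush_interval
+        self._pending = 0
+
+    def emit(self, sample_uuid: str, metric_name: str, value: int | float) -> None:
+        record = {
+            "sample_uuid": sample_uuid,
+            "metric_name": metric_name,
+            "value": value,
+            "timestamp_ns": time.time_ns(),  # wall-clock emit time, as noted above
+        }
+        self._file.write(self._encoder.encode(record) + b"\n")
+        self._pending += 1
+        if self._pending >= self._flush_interval:
+            self.flush()
+
+    def flush(self) -> None:
+        self._file.flush()
+        self._pending = 0
+
+    def close(self) -> None:
+        self.flush()
+        self._file.close()
+```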
+ +## TokenizePool + +Thread-pool wrapper around HuggingFace `AutoTokenizer` for ISL/OSL/TPOT computation. + +### Architecture + +``` + TokenizePool + ┌─────────────┐ + │ ThreadPool │ + token_count("text")──► Executor │ + (blocking) │ ┌───────┐ │ + │ │Thread1│──► thread-local AutoTokenizer + │ │Thread2│──► thread-local AutoTokenizer + │ │ ... │ │ + │ └───────┘ │ + └─────────────┘ +``` + +### Thread-Safety Analysis + +- **`ThreadPoolExecutor.submit()`** is internally synchronized — safe to call from + any thread. +- **Thread-local tokenizer instances** (`threading.local()`) mean zero shared mutable + state during tokenization. Each worker thread lazily loads its own + `AutoTokenizer.from_pretrained()` on first use. +- **HuggingFace tokenizers** (Rust backend via `tokenizers` crate) release the GIL + during the core tokenization work, so multiple threads actually run in parallel. +- **Blocking vs async**: `tokenize()` and `token_count()` block the calling thread + on `future.result()`. In async context, use `token_count_async()` which wraps the + call in `loop.run_in_executor(None, ...)` to avoid blocking the event loop. + +### Why `run_in_executor` for async? + +The `token_count_async` method uses a double-hop: `event loop executor → TokenizePool executor`. +This seems redundant but is necessary because: + +1. The aggregator's `process()` runs as an async task on the event loop. +2. Calling `pool.token_count()` directly would block the loop (the `future.result()` + inside `token_count()` is a synchronous wait). +3. `run_in_executor` offloads the blocking call to a thread, freeing the loop to + continue processing events. + +The inner `ThreadPoolExecutor` in `TokenizePool` still provides the thread-local +tokenizer isolation. The outer executor just prevents the blocking wait from starving +the event loop. + +## ISL Tracking: How the Prompt Gets to the Aggregator + +### Current Design + +The `ISSUED` event's `data` field carries a `PromptData` struct with either: + +- `text: str` — raw prompt string (OpenAI path), tokenized async by the aggregator. +- `token_ids: tuple[int, ...]` — pre-tokenized token IDs (SGLang/Harmonize path), + ISL is `len(token_ids)` with no tokenization needed. + +`EventRecord.data` is typed as `TextModelOutput | PromptData | ErrorData | None`. + +### Where to Publish + +The ISSUED event is published in the load generator when `issue_sample()` is called. +At that point, `sample.data` contains the post-transform dataset row. The publisher +extracts the prompt: + +```python +# In the load generator, when issuing a sample: +if "input_tokens" in sample.data: + prompt_data = PromptData(token_ids=tuple(sample.data["input_tokens"])) +elif "prompt" in sample.data: + prompt_data = PromptData(text=sample.data["prompt"]) +else: + prompt_data = None + +publisher.publish(EventRecord( + event_type=SampleEventType.ISSUED, + sample_uuid=sample.uuid, + data=prompt_data, +)) +``` + +### Adapter Considerations + +The prompt data available at ISSUED time is **post-transform** — dataset transforms +have already been applied by this point. 
This matters because: + +| Adapter | Transform Pipeline | `sample.data` at ISSUED | `PromptData` | +| ----------------------- | --------------------------------------------- | ----------------------------------- | ------------------------------------------- | +| OpenAI / OpenAI-Msgspec | `ColumnFilter → AddStaticColumns` | `{"prompt": "...", "model": "..."}` | `PromptData(text=prompt)` | +| SGLang | `Harmonize → ColumnFilter → AddStaticColumns` | `{"input_tokens": [int, ...]}` | `PromptData(token_ids=tuple(input_tokens))` | + +## Lifecycle + +### Startup + +```python +python -m inference_endpoint.async_utils.services.metrics_aggregator \ + --metrics-dir /tmp/metrics \ + --socket-dir /path/to/socket_dir \ + --socket-name ev_pub_ \ + --tokenizer gpt2 \ + --tokenizer-workers 2 +``` + +1. Create `TokenizePool` (if `--tokenizer` provided) +2. Create `JsonlMetricEmitter` writing to `/metrics.jsonl` +3. Create `MetricsAggregatorService` connected to the ZMQ PUB socket +4. `aggregator.start()` adds the ZMQ socket reader to the event loop +5. `await shutdown_event.wait()` blocks until ENDED is received + +### Shutdown + +On `SessionEventType.ENDED`: + +1. `_finalize()` flushes the emitter +2. `close()` closes the emitter file and removes the ZMQ socket reader +3. `shutdown_event.set()` unblocks the main coroutine +4. `TokenizePool.close()` shuts down worker threads (in `finally` block) + +### Graceful Drain + +Events received in the same batch as ENDED are processed (the `_shutdown_received` +flag is checked at the top of the loop, so events before ENDED in the batch still +get handled). Events in subsequent batches are dropped. + +In-flight samples that never receive COMPLETE will be abandoned (their rows stay in +the table but are never emitted). This is expected — if the session ends, those +samples didn't complete. + +## Output Format + +### JSONL Example (streaming sample) + +```json +{"sample_uuid":"a1b2c3d4","metric_name":"isl","value":42,"timestamp_ns":100000000} +{"sample_uuid":"a1b2c3d4","metric_name":"ttft_ns","value":1500000,"timestamp_ns":100001500} +{"sample_uuid":"a1b2c3d4","metric_name":"chunk_delta_ns","value":500000,"timestamp_ns":100002000} +{"sample_uuid":"a1b2c3d4","metric_name":"chunk_delta_ns","value":600000,"timestamp_ns":100002600} +{"sample_uuid":"a1b2c3d4","metric_name":"request_duration_ns","value":3800000,"timestamp_ns":100003800} +{"sample_uuid":"a1b2c3d4","metric_name":"sample_latency_ns","value":4000000,"timestamp_ns":100004000} +{"sample_uuid":"a1b2c3d4","metric_name":"osl","value":28,"timestamp_ns":100004001} +{"sample_uuid":"a1b2c3d4","metric_name":"tpot_ns","value":92592.6,"timestamp_ns":100004001} +``` + +### JSONL Example (non-streaming sample) + +```json +{"sample_uuid":"e5f6a7b8","metric_name":"isl","value":15,"timestamp_ns":200000000} +{"sample_uuid":"e5f6a7b8","metric_name":"request_duration_ns","value":2500000,"timestamp_ns":200002500} +{"sample_uuid":"e5f6a7b8","metric_name":"sample_latency_ns","value":3000000,"timestamp_ns":200003000} +{"sample_uuid":"e5f6a7b8","metric_name":"osl","value":50,"timestamp_ns":200003001} +``` + +Note: no `ttft_ns`, `chunk_delta_ns`, or `tpot_ns` for non-streaming — these require +`RECV_FIRST` which only occurs in streaming mode. + +## Not Yet Wired + +The EventRecord pub/sub infrastructure is ready, but actual `publish(EventRecord(...))` +calls for sample events have not been connected in the load generator or worker +processes. What needs to happen: + +1. 
**Load generator** (`load_generator.py` / `session.py`): Publish `ISSUED` with + prompt text, `START/STOP_PERFORMANCE_TRACKING`, `STARTED`, `ENDED`. +2. **Worker** (`worker.py`): Publish `CLIENT_SEND`, `CLIENT_RESP_DONE`, + `RECV_FIRST`, `RECV_NON_FIRST`, `COMPLETE` with response data. +3. **Session orchestrator**: Spawn the metrics aggregator subprocess alongside + the event logger subprocess, passing the same ZMQ socket address. diff --git a/docs/async_utils/services/metrics_aggregator/design.md b/docs/async_utils/services/metrics_aggregator/design.md deleted file mode 100644 index 4fbf769c..00000000 --- a/docs/async_utils/services/metrics_aggregator/design.md +++ /dev/null @@ -1,461 +0,0 @@ -# Metrics Aggregator Service — Design Document - -## Overview - -The metrics aggregator receives `EventRecord` messages from a ZMQ SUB socket, -computes per-sample metrics in real time, and pushes them to a `MetricEmitter` -backend (currently JSONL; future: Prometheus PushGateway). - -```mermaid -flowchart LR - Socket[ZMQ SUB socket] --> Dispatch[Event dispatch] - Dispatch --> Row[Create / update SampleRow] - Row --> Check{Triggers\na metric?} - Check -->|Yes| Task[Compute metric] - Check -->|No| Done[Done] - Task --> Emit[MetricEmitter.emit] -``` - -## Module Layout - -``` -metrics_aggregator/ -├── __init__.py -├── __main__.py # CLI entry point -├── aggregator.py # MetricsAggregatorService (thin event router) -├── emitter.py # MetricEmitter ABC, JsonlMetricEmitter -├── metrics_table.py # SampleRow, TrackedBlock, MetricsTable, EmitTrigger, triggers -└── token_metrics.py # TokenizePool (thread-pool tokenizer) -``` - -## Architecture - -### Component Roles - -| Component | Responsibility | -| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **MetricsAggregatorService** | Thin event router. Receives EventRecord batches, dispatches session events to `MetricsTable.handle_session_event()` and sample events to `MetricsTable.set_field()`. Owns shutdown logic. | -| **MetricsTable** | Owns sample rows, session state, trigger registry, tracked blocks, and in-flight task tracking. Handles row lifecycle (create on ISSUED, remove on COMPLETE), trigger dispatch, and tracked duration bookkeeping. | -| **SampleRow** | Pure data container (`msgspec.Struct, gc=False`). Holds per-sample timestamps and a `tracked_block_idx` linking it to its tracking window. No methods, no trigger awareness. | -| **EmitTrigger** | ABC for metric computations. Each trigger binds runtime deps (emitter, tokenize_pool, loop) at construction. `fire(ev_rec, row, pre_change)` is called by MetricsTable when a field is set. Must be non-blocking; returns an `asyncio.Task` if async work is needed. | -| **TrackedBlock** | Per-tracking-window duration state. Tracks `start_ns`, `last_complete_ns`, and `completed_samples`. Duration extends to the last sample completion, not to STOP_PERFORMANCE_TRACKING. 
| - -### Trigger System - -Triggers are registered on `MetricsTable` at aggregator construction time: - -```python -table = MetricsTable() -table.add_trigger("recv_first_ns", TtftTrigger(emitter)) -table.add_trigger("complete_ns", SampleLatencyTrigger(emitter)) -table.add_trigger("complete_ns", OslTpotTrigger(emitter, tokenize_pool, loop)) -``` - -Each trigger has: - -- `metric_name`: identifies the metric being computed. -- `requires`: tuple of SampleRow field names whose **pre-change** values are - snapshotted and passed to `fire()`. - -When `MetricsTable.set_field(uuid, field_name, value, ev_rec)` is called: - -1. Look up the row (or create it if ISSUED + tracking is on). -2. For each trigger registered on `field_name`: - a. Snapshot `{attr: getattr(row, attr) for attr in trigger.requires}`. - b. Call `trigger.fire(ev_rec, row, pre_change)`. - c. If a Task is returned, add it to `_in_flight_tasks`. -3. Set `row.field_name = value`. -4. If COMPLETE, update the tracked block and remove the row. - -This means triggers see the row state **before** the update. This is critical for -`chunk_delta_ns`, which needs the previous `last_recv_ns` value. - -## Subscribed Events - -### Session Events - -| Event | Effect | -| --------------------------------------------- | -------------------------------------------------------------------------- | -| `SessionEventType.STARTED` | Records `session_started_ns` on MetricsTable | -| `SessionEventType.START_PERFORMANCE_TRACKING` | Sets `is_tracking = True`, opens a new `TrackedBlock` | -| `SessionEventType.STOP_PERFORMANCE_TRACKING` | Sets `is_tracking = False` (tracked blocks keep extending via completions) | -| `SessionEventType.ENDED` | Triggers shutdown: drain in-flight tasks, then finalize | - -### Sample Events - -| Event | Field Set | Trigger(s) Fired | -| ------------------ | ------------------------------- | --------------------------------------------------------------------------------------------- | -| `ISSUED` | `issued_ns` | `IslTrigger` | -| `RECV_FIRST` | `recv_first_ns`, `last_recv_ns` | `TtftTrigger`, `FirstChunkTokenCountTrigger`, `ChunkDeltaTrigger` (skips: pre-change is None) | -| `RECV_NON_FIRST` | `last_recv_ns` | `ChunkDeltaTrigger` | -| `CLIENT_SEND` | `client_send_ns` | (none) | -| `CLIENT_RESP_DONE` | `client_resp_done_ns` | `RequestDurationTrigger` | -| `COMPLETE` | `complete_ns` | `SampleLatencyTrigger`, `OslTpotTrigger` | - -Ignored: `TRANSPORT_SENT`, `TRANSPORT_RECV` (infrastructure-level). - -## Performance Tracking and Tracked Duration - -### Tracking Windows - -`is_tracking` defaults to `False`. No samples are tracked until -`START_PERFORMANCE_TRACKING` is received. This allows warmup queries to be excluded. - -``` - STARTED ENDED - │ │ - ▼ ▼ -────┬─────────┬───────────────────────────────┬──────────────────┬── - │ warmup │ ◄── samples issued here │ cooldown │ - │ │ are tracked │ │ - │ START_PERF_TRACKING STOP_PERF_TRACKING │ - │ │ │ │ - │ │◄── block 0 ──────────────────►│ │ - │ │ (extends to last completion) │ │ -``` - -- A sample is tracked **if and only if** its ISSUED event arrives while - `is_tracking == True`. -- Once tracked, the sample continues to receive events and emit metrics - regardless of later STOP events. -- Duplicate START events (while already tracking) are no-ops. 
- -### TrackedBlock - -Each `START_PERFORMANCE_TRACKING` (when not already tracking) opens a new -`TrackedBlock`: - -```python -@dataclass(slots=True) -class TrackedBlock: - start_ns: int # timestamp of START_PERFORMANCE_TRACKING - last_complete_ns: int # max completion timestamp (init = start_ns) - completed_samples: int = 0 # count of completions in this block -``` - -When a tracked sample completes: - -- `block.last_complete_ns = max(block.last_complete_ns, complete_ns)` -- `block.completed_samples += 1` - -`block.duration_ns = last_complete_ns - start_ns` - -This means block duration extends to the **last tracked completion**, not to -STOP_PERFORMANCE_TRACKING. Samples issued during tracking but completing after -STOP still contribute to their block's duration. - -### Aggregate Metrics - -``` -total_tracked_duration_ns = sum(block.duration_ns for block in tracked_blocks) -total_completed_tracked_samples = sum(block.completed_samples for block in tracked_blocks) -QPS = total_completed_tracked_samples / total_tracked_duration_ns -``` - -An empty block (START → STOP with no samples) has `duration_ns = 0` and -`completed_samples = 0`, contributing nothing to either sum. - -### Multiple Tracking Windows Example - -``` -t=0: START → block 0 (start=0) -t=100: ISSUED s1 (block_idx=0) -t=200: ISSUED s2 (block_idx=0) -t=300: STOP → is_tracking=False -t=400: ISSUED s3 (untracked, is_tracking=False) -t=600: COMPLETE s1 → block 0: last_complete=600, completed=1 -t=700: COMPLETE s2 → block 0: last_complete=700, completed=2 -t=800: START → block 1 (start=800) -t=900: ISSUED s4 (block_idx=1) -t=1000: COMPLETE s4 → block 1: last_complete=1000, completed=1 - -block 0: duration = 700 - 0 = 700, samples = 2 -block 1: duration = 1000 - 800 = 200, samples = 1 -total_tracked_duration = 900 -QPS = 3 / 900 -``` - -## Data Model: SampleRow - -A `msgspec.Struct` with `gc=False` — no methods, no trigger awareness: - -```python -class SampleRow(msgspec.Struct, gc=False): - sample_uuid: str - tracked_block_idx: int = -1 # -1 = untracked (should never happen with current gate) - issued_ns: int | None = None - recv_first_ns: int | None = None - last_recv_ns: int | None = None - client_send_ns: int | None = None - client_resp_done_ns: int | None = None - complete_ns: int | None = None - first_chunk_token_count: int | None = None # set async by FirstChunkTokenCountTrigger -``` - -Compared to the previous design: - -- **Dropped**: `prompt_text` (ISL trigger reads from `ev_rec.data` directly), - `first_chunk_text` (replaced by `first_chunk_token_count`, computed eagerly at - RECV_FIRST), `output_chunks` (COMPLETE carries full output via TextModelOutput). -- **Added**: `tracked_block_idx` (links sample to its tracking window), - `first_chunk_token_count` (pre-computed token count for TPOT denominator). -- `gc=False` is safe: no mutable container fields that could form reference cycles. 
- -## Metrics Computed - -### Timing Metrics (sync triggers, emitted immediately) - -| Metric | Trigger | Field | Formula | -| --------------------- | ------------------------ | --------------------- | -------------------------------------------------------------------------------- | -| `ttft_ns` | `TtftTrigger` | `recv_first_ns` | `ev_rec.timestamp_ns - pre_change["issued_ns"]` | -| `chunk_delta_ns` | `ChunkDeltaTrigger` | `last_recv_ns` | `ev_rec.timestamp_ns - pre_change["last_recv_ns"]` (skips if pre-change is None) | -| `request_duration_ns` | `RequestDurationTrigger` | `client_resp_done_ns` | `ev_rec.timestamp_ns - pre_change["client_send_ns"]` | -| `sample_latency_ns` | `SampleLatencyTrigger` | `complete_ns` | `ev_rec.timestamp_ns - pre_change["issued_ns"]` | - -### Token Metrics (async triggers, fire tasks) - -| Metric | Trigger | Field | Source | -| -------------------------- | ----------------------------- | --------------- | -------------------------------------------------------------------------------- | -| `isl` | `IslTrigger` | `issued_ns` | `len(ev_rec.data.token_ids)` (sync) or `token_count(ev_rec.data.text)` (async) | -| `_first_chunk_token_count` | `FirstChunkTokenCountTrigger` | `recv_first_ns` | `token_count(str(ev_rec.data))` → stored on `row.first_chunk_token_count` | -| `osl`, `tpot_ns` | `OslTpotTrigger` | `complete_ns` | `token_count(str(ev_rec.data))` for OSL; TPOT uses `row.first_chunk_token_count` | - -## Data Flow - -```mermaid -flowchart TB - E[EventRecord from socket] --> D{Event type?} - - D -->|Session event| S["table.handle_session_event(ev_rec)"] - S --> S1[Update is_tracking / session_started_ns] - S --> S2[Open TrackedBlock on START] - - D -->|ENDED| SD[Set shutdown flag] - - D -->|Sample event| SF["table.set_field(uuid, field, value, ev_rec)"] - SF --> LC{ISSUED?} - LC -->|Yes + tracking on| CR[Create row, assign block_idx] - LC -->|Yes + tracking off| Skip[No-op] - LC -->|No| LU[Look up existing row] - LU --> LU2{Row exists?} - LU2 -->|No| Skip - LU2 -->|Yes| FT[Fire triggers] - CR --> FT - - FT --> Set[Set field on row] - Set --> CC{COMPLETE?} - CC -->|Yes| UB[Update tracked block] - UB --> RM[Remove row] - CC -->|No| Done[Done] - - SD --> Drain[Await in-flight tasks] - Drain --> Fin[Flush + close emitter] -``` - -### Execution Sequence - -```mermaid -sequenceDiagram - participant Socket as ZMQ SUB - participant Agg as Aggregator - participant Table as MetricsTable - participant Trigger as EmitTrigger - participant Pool as TokenizePool - participant Em as Emitter - - Socket->>Agg: batch of EventRecords - - Note over Agg: Session events - Agg->>Table: handle_session_event(ev_rec) - - Note over Agg: Sample events - Agg->>Table: set_field(uuid, field, value, ev_rec) - Table->>Table: lookup / create row - Table->>Trigger: fire(ev_rec, row, pre_change) - - alt Sync trigger - Trigger->>Em: emit(metric_name, value) - else Async trigger - Trigger->>Pool: token_count_async (creates Task) - Pool-->>Em: emit(metric_name, value) - Note over Table: Task tracked in _in_flight_tasks - end - - Table->>Table: setattr(row, field, value) - - Note over Agg: On ENDED - Agg->>Table: drain_tasks() - Table->>Table: await gather(*_in_flight_tasks) - Agg->>Em: flush() + close() -``` - -### Shutdown Sequence - -```mermaid -stateDiagram-v2 - [*] --> Listening: start() - Listening --> Processing: events arrive - Processing --> Listening: batch done - - Processing --> ShuttingDown: ENDED in batch - note right of ShuttingDown: shutdown flag set,
no new tasks fired - - ShuttingDown --> Draining: await table.drain_tasks() - Draining --> Finalized: all tasks complete - Finalized --> [*]: emitter flushed/closed,
shutdown_event set -``` - -## MetricEmitter - -The `MetricEmitter` ABC defines: - -```python -class MetricEmitter(ABC): - def emit(self, sample_uuid: str, metric_name: str, value: int | float) -> None: ... - def flush(self) -> None: ... - def close(self) -> None: ... -``` - -`emit()` has a None guard: if the emitter is closed, the call is a silent no-op. -This protects against late-arriving async tasks that complete after shutdown. - -### JsonlMetricEmitter (current implementation) - -Writes one JSON line per metric: - -```json -{"sample_uuid":"a1b2c3...","metric_name":"ttft_ns","value":1500,"timestamp_ns":98765432100} -{"sample_uuid":"a1b2c3...","metric_name":"sample_latency_ns","value":4000,"timestamp_ns":98765436100} -``` - -Uses `msgspec.json.Encoder` for serialization. Supports a configurable `flush_interval` -(flush to disk every N records). `timestamp_ns` is the wall-clock time when the metric -was emitted (not the event timestamp). - -### Future: PrometheusEmitter - -Would push to Prometheus PushGateway. The `emit()` signature supports this — -`metric_name` maps to a Prometheus metric, `sample_uuid` becomes a label, -`value` is the observation. Histograms/summaries can be built by accumulating -values per metric name. - -## TokenizePool - -Thread-pool wrapper around HuggingFace `AutoTokenizer` for ISL/OSL/TPOT computation. - -``` - TokenizePool - ┌─────────────┐ - │ ThreadPool │ - token_count("text")──► Executor │ - (blocking) │ ┌───────┐ │ - │ │Thread1│──► thread-local AutoTokenizer - │ │Thread2│──► thread-local AutoTokenizer - │ │ ... │ │ - │ └───────┘ │ - └─────────────┘ -``` - -- Each worker thread has its own `AutoTokenizer` via `threading.local()`, bound - to the pool instance. All threads are pre-warmed during `__init__`. -- HuggingFace tokenizers (Rust backend) release the GIL during tokenization, - so threads run in true parallel. -- `token_count_async()` wraps the blocking call in `loop.run_in_executor()` to - avoid blocking the event loop. -- Only `token_count()` / `token_count_async()` are exposed. The `tokenize()` - method (returning token strings) was removed — all metrics only need counts. - -## ISL Tracking: How the Prompt Gets to the Aggregator - -The `ISSUED` event's `data` field carries a `PromptData` struct with either: - -- `text: str` — raw prompt string (OpenAI path), tokenized async by `IslTrigger`. -- `token_ids: tuple[int, ...]` — pre-tokenized IDs (SGLang path), - ISL is `len(token_ids)` with no tokenization needed. - -The trigger reads directly from `ev_rec.data` — no prompt text is stored on -`SampleRow`. - -| Adapter | `sample.data` at ISSUED | `PromptData` | -| ----------------------- | ----------------------------------- | ------------------------------------------- | -| OpenAI / OpenAI-Msgspec | `{"prompt": "...", "model": "..."}` | `PromptData(text=prompt)` | -| SGLang | `{"input_tokens": [int, ...]}` | `PromptData(token_ids=tuple(input_tokens))` | - -## Lifecycle - -### Startup - -``` -python -m inference_endpoint.async_utils.services.metrics_aggregator \ - --metrics-dir /tmp/metrics \ - --socket-dir /tmp/socket_dir \ - --socket-name ev_pub_abc123 \ - --tokenizer gpt2 \ - --tokenizer-workers 2 -``` - -1. Create `TokenizePool` (if `--tokenizer` provided). -2. Create `JsonlMetricEmitter` writing to `/metrics.jsonl`. -3. Create `MetricsAggregatorService` — constructs `MetricsTable` and registers - all triggers with bound runtime deps. -4. `aggregator.start()` adds the ZMQ socket reader to the event loop. -5. 
`await shutdown_event.wait()` blocks until ENDED is received. - -### Shutdown - -On `SessionEventType.ENDED`: - -1. Set `_shutdown_received` flag — no new events are processed. -2. `await table.drain_tasks()` — wait for all in-flight async trigger tasks. -3. `_finalize()` — flush and close the emitter. -4. `shutdown_event.set()` — unblock the main coroutine. -5. `TokenizePool.close()` — shut down worker threads (in `finally` block). - -### Graceful Drain - -Events before ENDED in the same batch are processed; events after are dropped. -In-flight async tasks (ISL tokenization, OSL/TPOT computation) are awaited -before the emitter is closed, ensuring no metrics are lost. - -In-flight samples that never receive COMPLETE are abandoned — their rows remain -but no metrics are emitted. This is expected when the session ends. - -## Output Format - -### JSONL Example (streaming sample) - -```json -{"sample_uuid":"a1b2c3d4","metric_name":"isl","value":42,"timestamp_ns":100000000} -{"sample_uuid":"a1b2c3d4","metric_name":"ttft_ns","value":1500000,"timestamp_ns":100001500} -{"sample_uuid":"a1b2c3d4","metric_name":"chunk_delta_ns","value":500000,"timestamp_ns":100002000} -{"sample_uuid":"a1b2c3d4","metric_name":"chunk_delta_ns","value":600000,"timestamp_ns":100002600} -{"sample_uuid":"a1b2c3d4","metric_name":"request_duration_ns","value":3800000,"timestamp_ns":100003800} -{"sample_uuid":"a1b2c3d4","metric_name":"sample_latency_ns","value":4000000,"timestamp_ns":100004000} -{"sample_uuid":"a1b2c3d4","metric_name":"osl","value":28,"timestamp_ns":100004001} -{"sample_uuid":"a1b2c3d4","metric_name":"tpot_ns","value":92592.6,"timestamp_ns":100004001} -``` - -### JSONL Example (non-streaming sample) - -```json -{"sample_uuid":"e5f6a7b8","metric_name":"isl","value":15,"timestamp_ns":200000000} -{"sample_uuid":"e5f6a7b8","metric_name":"request_duration_ns","value":2500000,"timestamp_ns":200002500} -{"sample_uuid":"e5f6a7b8","metric_name":"sample_latency_ns","value":3000000,"timestamp_ns":200003000} -{"sample_uuid":"e5f6a7b8","metric_name":"osl","value":50,"timestamp_ns":200003001} -``` - -Note: no `ttft_ns`, `chunk_delta_ns`, or `tpot_ns` for non-streaming — these require -`RECV_FIRST` which only occurs in streaming mode. - -## Not Yet Wired - -The EventRecord pub/sub infrastructure is ready, but actual `publish(EventRecord(...))` -calls for sample events have not been connected in the load generator or worker -processes. What needs to happen: - -1. **Load generator** (`load_generator.py` / `session.py`): Publish `ISSUED` with - prompt text, `START/STOP_PERFORMANCE_TRACKING`, `STARTED`, `ENDED`. -2. **Worker** (`worker.py`): Publish `CLIENT_SEND`, `CLIENT_RESP_DONE`, - `RECV_FIRST`, `RECV_NON_FIRST`, `COMPLETE` with response data. -3. **Session orchestrator**: Spawn the metrics aggregator subprocess alongside - the event logger subprocess, passing the same ZMQ socket address. diff --git a/docs/commands/DESIGN.md b/docs/commands/DESIGN.md new file mode 100644 index 00000000..930b270c --- /dev/null +++ b/docs/commands/DESIGN.md @@ -0,0 +1,152 @@ +# Commands — Design Spec + +> Thin execution layer that maps Cyclopts CLI inputs to concrete command handlers and benchmark execution code. It owns dispatch and user-facing command boundaries, not core benchmarking logic. 
+ +**Component specs:** [async_utils](../async_utils/DESIGN.md) · **commands** · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +The command layer is split across: + +- `main.py` for top-level app setup, global flags, simple commands, and error-to-exit-code handling +- `commands/benchmark/cli.py` for the `benchmark` subcommands (`offline`, `online`, `from-config`) +- `commands/benchmark/execute.py` for benchmark setup, execution, and finalization +- One module per simple command: `probe.py`, `info.py`, `validate.py`, `init.py` + +Cyclopts constructs typed config objects directly from CLI arguments, so command functions receive +already-parsed models rather than raw `argparse.Namespace` objects. + +## Responsibilities + +- Register CLI commands and subcommands +- Translate typed CLI inputs into command execution calls +- Keep benchmark execution flow separate from CLI declaration +- Surface validation, setup, execution, and CLI errors through stable exit codes + +## Command Map + +| Subcommand | Entry point | Execution module | Status | +| ----------------------- | --------------------------- | ------------------------------- | ------------------------- | +| `benchmark offline` | `commands/benchmark/cli.py` | `commands/benchmark/execute.py` | Implemented | +| `benchmark online` | `commands/benchmark/cli.py` | `commands/benchmark/execute.py` | Implemented | +| `benchmark from-config` | `commands/benchmark/cli.py` | `commands/benchmark/execute.py` | Implemented | +| `probe` | `main.py` | `commands/probe.py` | Implemented | +| `info` | `main.py` | `commands/info.py` | Implemented | +| `validate-yaml` | `main.py` | `commands/validate.py` | Implemented | +| `init` | `main.py` | `commands/init.py` | Implemented | +| `eval` | `main.py` | inline stub (`CLIError`) | Reserved, not implemented | + +## CLI Structure + +``` +inference-endpoint + | + +-- global launcher in main.py + | - applies -v / --verbose + | - configures logging + | - dispatches into Cyclopts app + | + +-- benchmark + | +-- offline + | +-- online + | +-- from-config + | + +-- probe + +-- info + +-- validate-yaml + +-- init + +-- eval +``` + +`benchmark` is registered lazily from `commands/benchmark/cli.py`, keeping startup light for +simple commands like `info` and `validate-yaml`. + +## `benchmark` Command Flow + +``` +CLI / YAML input + | + v +Cyclopts + | + +-- offline / online: + | construct OfflineBenchmarkConfig / OnlineBenchmarkConfig + | pass repeatable --dataset strings separately + | + +-- from-config: + | load YAML path + | BenchmarkConfig.from_yaml_file() + | optionally apply --timeout / --mode overrides + | + v +commands/benchmark/cli.py::_run() + | + +-- inject CLI dataset strings via config.with_updates(datasets=...) 
+ +-- normalize dataset validation errors + | + v +commands/benchmark/execute.py::run_benchmark() + | + +-- prepare report dir and runtime context + +-- load datasets + +-- construct endpoint client + sample issuer + +-- run BenchmarkSession in threaded wrapper + +-- finalize metrics and optional accuracy scoring +``` + +## `probe` Command + +`probe` is a lightweight connectivity check built on the same endpoint/client stack as the main +benchmark path. It issues a small number of synthetic prompts, then reports success rate, latency, +and sample responses. Its purpose is to validate endpoint reachability and request formatting +before launching a full benchmark. + +## Utility Commands + +| Command | What it does | +| --------------- | -------------------------------------------------------------- | +| `info` | Prints local system and environment information | +| `validate-yaml` | Loads a YAML config and runs schema validation | +| `init` | Copies a config template from `config/templates/` into the cwd | + +## Design Decisions + +**Cyclopts models are the CLI boundary** + +The command layer does not parse raw strings manually unless a flag is intentionally free-form, +such as repeatable `--dataset` values. Most arguments are parsed straight into Pydantic models +defined in `config/schema.py`, which keeps command handlers small and pushes field validation to +the schema layer. + +**Benchmark declaration and execution are split** + +`commands/benchmark/cli.py` owns subcommand shape and input normalization. `commands/benchmark/execute.py` +owns the multi-phase benchmark lifecycle. This keeps the CLI definition readable while allowing the +execution path to grow without turning the CLI module into orchestration code. + +**Simple commands stay in `main.py` when they are thin** + +Top-level commands with small signatures (`info`, `init`, `validate-yaml`, `probe`) are registered +directly in `main.py` and delegate immediately to their implementation modules. That keeps the app +topology visible in one place without introducing extra wrapper files. + +**`eval` is intentionally reserved** + +The `eval` command is exposed in help output but still raises `CLIError` with a tracking issue +link. The benchmark +path already supports dataset-specific accuracy evaluation, but the standalone `eval` command has +not been implemented yet. + +## Integration Points + +| Dependency | Role | +| --------------------------- | ---------------------------------------------------------------- | +| `main.py` | App definition, logging setup, global error handling | +| `config/` | Defines CLI/YAML schema models and config loading | +| `dataset_manager/` | Loads performance and accuracy datasets | +| `endpoint_client/` | Sends requests to endpoint workers | +| `load_generator/session.py` | Runs the benchmark session | +| `metrics/` | Aggregates and reports benchmark results | +| `evaluation/` | Scores collected accuracy datasets during benchmark finalization | diff --git a/docs/config/DESIGN.md b/docs/config/DESIGN.md new file mode 100644 index 00000000..795efbb6 --- /dev/null +++ b/docs/config/DESIGN.md @@ -0,0 +1,152 @@ +# Config — Design Spec + +> Parses YAML and CLI configuration, exposes ruleset extension points, and produces the immutable `RuntimeSettings` object that drives downstream components. 
+ +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · **config** · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`config/` translates user-provided configuration (CLI arguments or YAML files) into validated +config models and, from those, an immutable `RuntimeSettings` object that drives the rest of the +system. It also provides the extension point for competition rulesets, though full ruleset-driven +runtime construction is still incomplete in the current execution path. + +## Responsibilities + +- Validate and parse YAML configuration with Pydantic +- Expose YAML templates for common benchmark patterns +- Define ruleset extension points (MLCommons and future competitions) +- Produce an immutable `RuntimeSettings` from validated config + +## Component Map + +``` +CLI args / YAML file + │ + ▼ + BenchmarkConfig (Pydantic — schema.py) + │ + ▼ + RuntimeSettings.from_config() + │ + +── optional future ruleset integration + ▼ + RuntimeSettings (frozen dataclass) ←─── drives all downstream components +``` + +## Key Types + +### `BenchmarkConfig` (Pydantic model) + +Top-level YAML schema. Most nested fields have defaults, but the top-level config still requires +the benchmark `type` and `endpoint_config`. + +Key nested models: + +| Model | Purpose | +| ---------------- | --------------------------------------------------- | +| `LoadPattern` | Pattern type + parameters (target QPS, concurrency) | +| `RuntimeConfig` | Duration, sample count, RNG seeds | +| `ClientSettings` | Worker count and HTTP client settings | +| `EndpointConfig` | Endpoint URLs, API key | +| `Dataset` | Dataset path, type (performance / accuracy) | + +### `RuntimeSettings` (frozen dataclass) + +Immutable snapshot of all parameters needed to execute a run. + +| Field | Type | Source | +| -------------------- | -------------- | --------------------------------------- | +| `load_pattern` | `LoadPattern` | config | +| `n_samples_to_issue` | `int` | calculated: QPS × duration, or explicit | +| `min_duration_ms` | `int` | runtime config | +| `max_duration_ms` | `int` | runtime config | +| `min_sample_count` | `int` | current default / future ruleset hook | +| `metric_target` | `Metric` | primary target driving scheduler logic | +| `reported_metrics` | `list[Metric]` | metrics validated after the run | +| `rng_sched` | `Random` | seeded from `scheduler_random_seed` | +| `rng_sample_index` | `Random` | seeded from `dataloader_random_seed` | + +Once constructed, `RuntimeSettings` cannot be modified. All consumers receive the same instance. + +### `BenchmarkSuiteRuleset` (abstract base) + +Extension point for competition-specific constraints. + +```python +class BenchmarkSuiteRuleset(ABC): + version: str + + @abstractmethod + def apply_user_config(self, *args, **kwargs) -> RuntimeSettings: + ... +``` + +Implementations override `apply_user_config()` to enforce minimum durations, sample counts, +required metrics, and fixed RNG seeds. The MLCommons ruleset lives in `rulesets/mlcommons/`. 
+That interface exists today, but `RuntimeSettings.from_config()` still uses the default conversion +path even when a ruleset object is supplied. + +Rulesets are registered in `ruleset_registry.py` by name string (e.g. `"mlperf-inference-v5.1"`). + +## Key Enums + +| Enum | Values | +| ----------------- | ------------------------------------------ | +| `APIType` | `OPENAI`, `SGLANG` | +| `LoadPatternType` | `MAX_THROUGHPUT`, `POISSON`, `CONCURRENCY` | +| `DatasetType` | `PERFORMANCE`, `ACCURACY` | +| `TestMode` | `PERF`, `ACC`, `BOTH` | +| `TestType` | `OFFLINE`, `ONLINE`, `EVAL`, `SUBMISSION` | +| `StreamingMode` | `AUTO`, `ON`, `OFF` | + +## YAML Templates + +Pre-built templates are stored in `config/templates/`: + +| Template | Use case | +| --------------------------- | ---------------------------- | +| `offline_template.yaml` | Max-throughput offline run | +| `online_template.yaml` | Poisson online run | +| `concurrency_template.yaml` | Fixed-concurrency online run | +| `eval_template.yaml` | Accuracy evaluation | +| `submission_template.yaml` | Official MLPerf submission | + +Generated via `inference-endpoint init `. `concurrency_template.yaml` exists on disk, but the +current `init` command exposes only `offline`, `online`, `eval`, and `submission`. + +## Design Decisions + +**Pydantic for config, frozen dataclass for runtime** + +Pydantic is used at the boundary (file parsing, CLI parsing) where untrusted input arrives and +validation error messages matter. `RuntimeSettings` is a frozen dataclass because it carries no +untrusted data and must not change after construction. Using Pydantic in the hot path would add +unnecessary overhead. + +**Ruleset is the intended strategy object** + +The ruleset pattern keeps competition-specific constraints out of the core benchmark logic. Adding +a new ruleset requires only implementing `BenchmarkSuiteRuleset` and registering the class — no +changes to `BenchmarkSession` or the CLI. The abstraction is present, but the live benchmark path +has not fully delegated runtime construction to rulesets yet. + +**Reproducibility via explicit seeds** + +`RuntimeSettings` contains two seeded `Random` instances: one for scheduler timing jitter +(`rng_sched`) and one for dataset sample ordering (`rng_sample_index`). These make the runtime +configuration reproducible in principle, but the original seed values are not currently persisted +to the report output. + +## Integration Points + +| Consumer | Usage | +| ------------------------------- | ------------------------------------------------------------ | +| `load_generator/session.py` | Receives `RuntimeSettings` at construction | +| `load_generator/scheduler.py` | Reads `load_pattern`, `n_samples_to_issue`, RNG seeds | +| `endpoint_client/config.py` | Reads `api_type`, `num_workers`, streaming mode | +| `metrics/reporter.py` | Reads `reported_metrics`, duration bounds | +| `commands/benchmark/cli.py` | Defines benchmark subcommands and resolves CLI vs YAML input | +| `commands/benchmark/execute.py` | Runs the benchmark lifecycle from resolved configuration | diff --git a/docs/core/DESIGN.md b/docs/core/DESIGN.md new file mode 100644 index 00000000..99062ab9 --- /dev/null +++ b/docs/core/DESIGN.md @@ -0,0 +1,115 @@ +# Core Types — Design Spec + +> Shared `msgspec.Struct` data structures used across transport, endpoint adapters, and metrics, with small helper methods and auto-managed timing fields. 
+ +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · **core** · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`core/` defines the fundamental data structures passed between all system components. Every other package +depends on these types; they depend on nothing else in the project. + +## Responsibilities + +- Define the wire format for queries, results, and streaming chunks +- Provide a single source of truth for status and output representation +- Minimize serialization overhead on the hot path + +## Key Types + +### `Query` + +Represents a single inference request issued to an endpoint. + +| Field | Type | Description | +| ------------ | ---------------- | ------------------------------------------------- | +| `id` | `str` | UUID string for result correlation (auto-set) | +| `data` | `dict[str, Any]` | Request payload (prompt, model, params, etc.) | +| `headers` | `dict[str, str]` | HTTP headers (e.g., authorization) | +| `created_at` | `float` | Epoch timestamp when query was created (auto-set) | + +The adapter layer (`openai/`, `sglang/`) is responsible for structuring `data` — `Query` itself is format-agnostic. + +### `QueryResult` + +Represents a completed (success or failure) inference response. + +| Field | Type | Description | +| ----------------- | -------------------------------- | --------------------------------------------------------------- | +| `id` | `str` | Matches originating `Query.id` | +| `response_output` | `TextModelOutput \| str \| None` | Response content (None on error; plain `str` remains supported) | +| `metadata` | `dict[str, Any]` | Additional response metadata (token counts, etc.) | +| `error` | `ErrorData \| None` | Structured error if query failed | +| `completed_at` | `int` | Monotonic timestamp in nanoseconds (auto-set) | + +### `StreamChunk` + +Represents one SSE delta from a streaming response. + +| Field | Type | Description | +| ---------------- | ---------------- | ------------------------------ | +| `id` | `str` | Matches originating `Query.id` | +| `response_chunk` | `str` | Incremental token text | +| `is_complete` | `bool` | True for the final chunk | +| `metadata` | `dict[str, Any]` | Per-chunk metadata | + +### `TextModelOutput` + +Holds the final model response text and optional reasoning trace. + +| Field | Type | Description | +| ----------- | -------------------------------- | ---------------------------------------------------------------------- | +| `output` | `str \| tuple[str, ...]` | Decoded text; tuple for streaming accumulation | +| `reasoning` | `str \| tuple[str, ...] \| None` | Optional reasoning trace; tuple when accumulated from streaming chunks | + +### Supporting Types + +`core/types.py` also defines `PromptData` (attached to issued events for token metrics) and +`ErrorData` (structured error payloads used on `QueryResult.error`). 
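+
+Put together, the tables above imply struct definitions along these lines. This is a minimal
+sketch using the configuration flags discussed under Design Decisions below; the default
+factories are assumptions, not the actual definitions in `core/types.py`:
+
+```python
+import time
+import uuid
+from typing import Any
+
+import msgspec
+
+class Query(msgspec.Struct, frozen=True, array_like=True, gc=False, omit_defaults=True):
+    # Field order is the wire order; see the Serialisation Contract below.
+    id: str = msgspec.field(default_factory=lambda: str(uuid.uuid4()))
+    data: dict[str, Any] = msgspec.field(default_factory=dict)
+    headers: dict[str, str] = msgspec.field(default_factory=dict)
+    created_at: float = msgspec.field(default_factory=time.time)
+
+query = Query(data={"model": "m", "messages": []})
+wire = msgspec.json.encode(query)  # positional array, not an object
+```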
+
+### `QueryStatus`
+
+Enum: `PENDING` → `RUNNING` → `COMPLETED` / `FAILED` / `CANCELLED`
+
+## Design Decisions
+
+**`msgspec.Struct` with `frozen=True`, `array_like=True`, `gc=False`, `omit_defaults=True`**
+
+All four flags are deliberate hot-path optimisations:
+
+- `frozen=True` prevents accidental mutation after creation.
+- `array_like=True` serialises to a JSON array (positional fields) rather than a dict, cutting wire size.
+- `gc=False` removes the type from GC tracking; structs with no cyclic references don't need it.
+- `omit_defaults=True` reduces serialised size for optional fields.
+
+Field mutation is prohibited. Use `msgspec.structs.force_setattr()` only in controlled accumulator code.
+
+**Minimal helper logic on otherwise transport-oriented types**
+
+The core structs are primarily data containers, but they do include small helper behaviors where
+the implementation needs them: `QueryResult.completed_at` is auto-set in `__post_init__`,
+`TextModelOutput.__str__()` flattens output for reporting, and `TextModelOutput.text_after_first_chunk()`
+supports TPOT calculation.
+
+## Serialisation Contract
+
+Types are serialised with `msgspec.json.encode()` and decoded with `msgspec.json.decode()`.
+Because `array_like=True`, the wire format is positional:
+
+```
+Query → ["<id>", {<data>}, {<headers>}, <created_at>]
+```
+
+Field order is determined by struct definition order and must not be changed without a migration.
+
+## Integration Points
+
+| Consumer                 | Usage                                                                               |
+| ------------------------ | ----------------------------------------------------------------------------------- |
+| `endpoint_client/`       | Creates `Query`; receives `QueryResult` and `StreamChunk`                           |
+| `load_generator/`        | Passes `Query` to `SampleIssuer`; routes `QueryResult`/`StreamChunk` to event hooks |
+| `async_utils/transport/` | Serialises/deserialises these types over ZMQ IPC                                   |
+| `metrics/recorder.py`    | Reads `id` and timing fields for event recording                                   |
+| `openai/`, `sglang/`     | Constructs `QueryResult` and `StreamChunk` from API responses | diff --git a/docs/dataset_manager/DESIGN.md b/docs/dataset_manager/DESIGN.md new file mode 100644 index 00000000..5650e601 --- /dev/null +++ b/docs/dataset_manager/DESIGN.md @@ -0,0 +1,154 @@ +# Dataset Manager — Design Spec
+
+> Loads benchmark datasets from local files and HuggingFace sources and applies ordered transform pipelines to produce request-ready samples for the load generator.
+
+**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · **dataset_manager** · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md)
+
+---
+
+## Overview
+
+`dataset_manager/` loads benchmark datasets from various sources and applies transformation
+pipelines to produce request-ready samples. It decouples dataset format (how data is stored)
+from model and adapter requirements (how data must be shaped).
+ +## Responsibilities + +- Load samples from JSONL, JSON, CSV, Parquet, and HuggingFace sources +- Apply ordered transform pipelines to adapt raw rows to API format +- Provide a uniform `Dataset` interface regardless of source or format +- Register built-in (predefined) datasets by name for ruleset use + +## Component Map + +``` +DataLoaderFactory + | + +-- format -> DatafileLoader subclass + | (jsonl / json / csv / parquet / hf) + | | + | v + | raw DataFrame + | | + +-- transforms -> Transform pipeline + | + v + Dataset (load_sample / num_samples) +``` + +## Public Interface + +### `Dataset` + +Concrete base class. Subclasses register themselves in `Dataset.PREDEFINED` via +`__init_subclass__`. + +```python +class Dataset: + PREDEFINED: ClassVar[dict[str, type["Dataset"]]] # name → subclass registry + + def load_sample(self, index: int) -> Any: ... + def num_samples(self) -> int: ... + + repeats: int = 1 + # When repeats > 1, the dataset wraps around after num_samples() +``` + +`load_sample()` typically returns a `dict`, but the return type is `Any` — dataset schemas vary +widely and are not enforced at the base class level. + +### `DataLoaderFactory` + +```python +class DataLoaderFactory: + @staticmethod + def create_loader( + config: DatasetConfig, num_repeats: int = 1, **kwargs + ) -> Dataset: ... +``` + +`config` is the `Dataset` Pydantic model from `config/schema.py`; it carries path, format, +parser/remap config, and dataset name. Format is inferred from file extension when +`config.format` is not set: + +- `.jsonl` → `JSONL` +- `.json` → `JSON` +- `.csv` → `CSV` +- `.parquet` → `PARQUET` +- explicit `format=huggingface` → `HF` + +Presets (e.g. `"gpqa::Qwen/Qwen3-8B"`) are encoded in `config.name` as a `"::"` split — the +factory resolves them to a predefined dataset class with a model-specific transform stack. + +### `Transform` (abstract base) + +```python +class Transform(ABC): + @abstractmethod + def __call__(self, df: pd.DataFrame) -> pd.DataFrame: ... +``` + +Transforms are composed in order; each receives the output of the previous. + +## Built-in Transforms + +| Transform | Purpose | +| ----------------------- | ------------------------------------------------------ | +| `ColumnRemap` | Rename columns (e.g. `question` -> `prompt`) | +| `UserPromptFormatter` | Apply format string to produce the `prompt` column | +| `MakeAdapterCompatible` | Ensure columns match what `HttpRequestAdapter` expects | + +## Predefined Datasets + +Registered in `dataset.py` under `Dataset.PREDEFINED`. Referenced by name in rulesets and YAML +configs. Each predefined dataset ships with default transforms for supported model families. + +| Name | Source | Notes | +| --------------------------- | ------------- | ------------------------------------------ | +| `aime25` | AIME 2025 | Math reasoning | +| `gpqa` | GPQA Diamond | Science QA | +| `cnndailymail` | CNN/DailyMail | Summarization | +| `open_orca` | OpenOrca | General instruction | +| `livecodebench` | LiveCodeBench | Code generation; requires additional setup | +| `shopify_product_catalogue` | Shopify | E-commerce Q&A (q3vl) | +| `random` | Synthetic | Generated prompts for throughput testing | + +## Preset System + +A preset string like `"gpqa::Qwen/Qwen3-8B"` resolves to a predefined dataset with a +model-specific transform stack pre-applied. This is used by rulesets to ensure consistent +prompt formatting across submissions. 
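+
+For example, a preset's transform stack composes built-in and custom transforms in order. The
+following is a minimal sketch; it assumes `ColumnRemap` takes a rename mapping, which may differ
+from the real constructor:
+
+```python
+import pandas as pd
+
+class TruncatePrompt(Transform):
+    """Illustrative custom transform: clip prompts to a maximum length."""
+
+    def __init__(self, max_chars: int) -> None:
+        self.max_chars = max_chars
+
+    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
+        df = df.copy()
+        df["prompt"] = df["prompt"].str.slice(0, self.max_chars)
+        return df
+
+# Each transform receives the output of the previous one.
+transforms = [ColumnRemap({"question": "prompt"}), TruncatePrompt(4096)]
+df = pd.DataFrame({"question": ["What is 2 + 2?"]})
+for t in transforms:
+    df = t(df)
+```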
+ +## Design Decisions + +**Transforms are separate from datasets** + +The same raw dataset can be used with different models (each with different prompt templates) or +different API adapters (OpenAI vs SGLang). Keeping transforms out of the dataset class means +neither the dataset nor the adapter has to know about the other. + +**Format inference from extension** + +Reducing friction for CLI users is a priority. Specifying `--dataset my_data.jsonl` should just +work. For non-standard sources such as HuggingFace datasets, callers can set the dataset +`format` explicitly in YAML or in the repeatable `--dataset ...,format=huggingface` string. + +**`load_sample()` returns a dict, not a typed struct** + +Dataset schemas vary widely (different columns, optional fields). A dict interface avoids a +proliferation of dataset-specific types while still being easily introspectable and debuggable. +The adapter layer (`openai/openai_adapter.py`) is responsible for reading the expected keys. + +**`repeats` for issuing more samples than the dataset size** + +When `n_samples_to_issue > num_samples()`, the dataset wraps. Index arithmetic (`index % +num_samples()`) is handled by the Dataset base class. This avoids duplicating the logic in every +scheduler. + +## Integration Points + +| Consumer | Usage | +| ---------------------------------- | ------------------------------------------------------------- | +| `load_generator/load_generator.py` | Calls `load_sample(index)` for each scheduled query | +| `config/rulesets/mlcommons/` | References predefined datasets by name | +| `commands/benchmark/` | Constructs dataset via `DataLoaderFactory` from CLI/YAML args | diff --git a/docs/endpoint_client/DESIGN.md b/docs/endpoint_client/DESIGN.md new file mode 100644 index 00000000..c3f98ba3 --- /dev/null +++ b/docs/endpoint_client/DESIGN.md @@ -0,0 +1,177 @@ +# Endpoint Client — Design Spec + +> Multi-process HTTP worker pool that sends queries to inference endpoints over persistent connections and delivers responses back to the load generator with minimal latency overhead. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · **endpoint_client** · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`endpoint_client/` sends queries to remote inference endpoints and delivers responses back to the +load generator. It uses a **multi-process worker pool** communicating over ZMQ IPC to bypass the +GIL and sustain high request rates. + +This file is the primary component-level design spec for the endpoint client. For deeper +implementation detail on the connection pool, worker internals, SSE handling, and performance +analysis, see the companion deep-dive document +[ENDPOINT_CLIENT.md](../ENDPOINT_CLIENT.md). + +For detailed CPU affinity configuration and tuning parameters, see +[CLIENT_PERFORMANCE_TUNING.md](../CLIENT_PERFORMANCE_TUNING.md) and +[PERF_ARCHITECTURE.md](../PERF_ARCHITECTURE.md). 
+
+## Responsibilities
+
+- Spawn and manage a pool of worker processes
+- Route outbound queries to workers via round-robin
+- Deliver inbound responses (`QueryResult`, `StreamChunk`) to callers
+- Manage persistent TCP connections per worker
+- Apply CPU affinity for NUMA-aware placement
+
+## Architecture
+
+```
+Main Process
+┌─────────────────────────────────────────┐
+│ HTTPEndpointClient                      │
+│   ├── uvloop event loop                 │
+│   └── WorkerManager                     │
+│         └── WorkerPoolTransport (ZMQ)   │
+└──────────────┬──────────────────────────┘
+               │ ZMQ IPC (inproc/ipc)
+    ┌──────────┴──────────┐
+    │                     │
+Worker 0              Worker N
+┌──────────┐         ┌──────────┐
+│ uvloop   │         │ uvloop   │
+│ Worker   │         │ Worker   │
+│ ConnPool │         │ ConnPool │
+└──────┬───┘         └──────┬───┘
+       │  HTTP/1.1          │
+       └─────────┬──────────┘
+             Endpoint
+```
+
+## Public Interface
+
+### `HTTPEndpointClient`
+
+```python
+class HTTPEndpointClient:
+    def __init__(self, config: HTTPClientConfig, ...) -> None
+
+    # Sample issuer interface
+    def issue(self, query: Query) -> None   # non-blocking, round-robin
+    def shutdown(self) -> None              # synchronous shutdown
+
+    # Response retrieval (use one pattern per call-site)
+    def poll(self) -> QueryResult | StreamChunk | None         # non-blocking
+    async def recv(self) -> QueryResult | StreamChunk | None   # blocking async
+    def drain(self) -> list[QueryResult | StreamChunk]         # batch
+```
+
+### `HTTPClientConfig`
+
+```python
+class HTTPClientConfig(WithUpdatesMixin, BaseModel):
+    endpoint_urls: list[str]
+    api_type: APIType = APIType.OPENAI
+    api_key: str | None = None
+    num_workers: int = -1                 # -1 = auto (NUMA-aware)
+    worker_gc_mode: Literal["disabled", "relaxed", "system"] = "relaxed"
+    max_idle_time: float = 4.0
+    warmup_connections: int = -1          # -1 = auto (50% of max_connections)
+    max_connections: int = -1             # -1 = bound by ephemeral port limit
+    stream_all_chunks: bool = False       # expose every SSE chunk (perf cost)
+    cpu_affinity: AffinityPlan | None = None
+```
+
+## Data Flow
+
+**Outbound (issue → endpoint):**
+
+```
+HTTPEndpointClient.issue(query)
+  → select next worker (round-robin index)
+  → serialize Query with msgspec.json
+  → ZMQ PUSH to worker socket
+  → Worker receives query
+  → HttpRequestAdapter formats HTTP request
+  → ConnectionPool acquires connection
+  → HTTP/1.1 request sent
+```
+
+**Inbound (endpoint → caller):**
+
+```
+HTTP response received by Worker
+  → HttpResponseProtocol (httptools parser)
+  → Accumulator builds QueryResult / StreamChunk
+  → ZMQ PUSH result back to main process
+  → WorkerPoolTransport routes to response queue
+  → HTTPEndpointClient.recv() / poll() / drain()
+```
+
+## Key Components
+
+### `ConnectionPool`
+
+Maintains a persistent TCP connection pool per worker. Connections can be warmed up before the
+benchmark starts to reduce cold-start latency. Idle connections are evicted after
+`max_idle_time` seconds.
+
+### `HttpResponseProtocol`
+
+`asyncio.Protocol` implementation using `httptools.HttpResponseParser` (llhttp C parser). Handles
+both streaming (SSE) and non-streaming responses. Connections are reused between requests via
+`reset()` without re-establishing TCP.
+
+### `WorkerManager`
+
+Spawns worker processes via `multiprocessing.Process`. Monitors liveness with periodic checks
+during startup. Applies CPU affinity via `AffinityPlan` after all workers are alive.
+
+### `HttpRequestAdapter` (abstract base class)
+
+Converts a `Query` into raw HTTP request bytes.
Implementations: + +- `openai/openai_msgspec_adapter.py` — fast path using msgspec +- `sglang/adapter.py` — SGLang-specific format + +## Design Decisions + +**Multi-process over multi-thread** + +Python's GIL prevents true parallelism in a threaded HTTP client at high QPS. Worker processes +each own a uvloop event loop and a connection pool, achieving genuine concurrency. ZMQ IPC +has lower overhead than inter-process queues or sockets for this pattern. + +**Round-robin dispatch (not work-stealing)** + +Round-robin is O(1) and avoids contention on a shared queue. Workers have equal capacity, so +skewed distribution is not a concern in practice. + +**`httptools` over `aiohttp`/`httpx`** + +`httptools` is the same C parser used by Node.js (llhttp). It exposes a callback API that feeds +directly into the asyncio protocol, eliminating intermediate buffering. `aiohttp` and `httpx` add +abstraction layers that increase latency variance. + +**`stream_all_chunks=False` by default** + +Exposing every SSE chunk requires passing each through the ZMQ transport, adding per-chunk +serialisation cost. By default the client still forwards the first chunk for TTFT measurement, +suppresses intermediate chunks, and then returns the final assembled `QueryResult` at end of +stream. Enable `stream_all_chunks` only when callers need every chunk, not just TTFT and the +final response. + +## Integration Points + +| Dependency | Role | +| ------------------------ | ------------------------------------------------------ | +| `core/types.py` | `Query` in, `QueryResult`/`StreamChunk` out | +| `async_utils/transport/` | ZMQ IPC between main process and workers | +| `openai/`, `sglang/` | `HttpRequestAdapter` and accumulator implementations | +| `load_generator/` | Provides the `SampleIssuer` ABC consumed by the client | +| `config/` | `HTTPClientConfig` derived from `RuntimeSettings` | diff --git a/docs/evaluation/DESIGN.md b/docs/evaluation/DESIGN.md new file mode 100644 index 00000000..ee7135ba --- /dev/null +++ b/docs/evaluation/DESIGN.md @@ -0,0 +1,96 @@ +# Evaluation — Design Spec + +> Scores model responses against ground-truth answers after an accuracy benchmark run; extracts answers from raw response text and supports LiveCodeBench code-execution evaluation via an external sandboxed server. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · **evaluation** · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`evaluation/` scores model responses against ground-truth answers for accuracy benchmarks. +It is invoked after a benchmark run that collected responses (i.e. `--mode acc` or `--mode both`). +Today that orchestration happens in `commands/benchmark/execute.py`, not in `metrics/`. 
+ +## Responsibilities + +- Extract model answers from raw response text +- Score extracted answers against ground truth +- Support LiveCodeBench code execution evaluation (requires external server) + +## Component Map + +``` +QueryResult.response_output (raw response text) + | + v +extractor.py --> extracted answer string + | + v +scoring.py --> correct / incorrect (per sample) + | + v +accuracy summary written into benchmark results +``` + +## Files + +| File | Purpose | +| ---------------- | ----------------------------------------------------------------- | +| `extractor.py` | Extracts model answer from raw text (regex, boxed-answer parsing) | +| `scoring.py` | Compares extracted answer to ground truth label | +| `livecodebench/` | LiveCodeBench-specific code execution pipeline | + +## LiveCodeBench + +LiveCodeBench requires a sandboxed code execution server. The `livecodebench/` subdirectory +contains the server implementation and a Dockerfile. See +`src/inference_endpoint/evaluation/livecodebench/README.md` for setup +instructions. + +Files: + +- `_server.py` — FastAPI server that executes submitted code +- `lcb_serve.py` — Server management utilities +- `generate.py` — Response generation utilities +- `run_lcb_tests.py` — Test runner for LCB evaluation +- `lcb_serve.dockerfile` — Docker image for the execution server + +## Scoring Methods + +The scorer registry in `evaluation/scoring.py` currently includes: + +| Method | Description | +| --------------------- | -------------------------------------------------------------- | +| `pass_at_1` | Exact-match style scoring; also used by the LiveCodeBench path | +| `string_match` | Whitespace-trimmed string equality | +| `rouge` | ROUGE-based text generation scoring | +| `code_bench_scorer` | LiveCodeBench code-execution scoring | +| `shopify_category_f1` | Shopify category F1 evaluation | + +The scoring configuration used by benchmark execution is specified per accuracy dataset under +`datasets[].accuracy_config`, including `accuracy_config.eval_method`, +`accuracy_config.extractor`, and optional `accuracy_config.ground_truth`. + +## Design Decisions + +**Extraction is separate from scoring** + +Model responses for tasks like GPQA often embed the answer in verbose reasoning text. Extraction +(finding the answer in the text) and scoring (comparing the answer) are separate concerns. +Different datasets may share a scoring method but require different extraction logic. + +**LiveCodeBench requires an external service** + +Code execution cannot be done safely in-process. The evaluation server runs in a Docker container +with resource limits. This is a deliberate architecture choice — not a shortcut — and is +documented prominently in the dataset README. 
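+
+To make the extraction/scoring split concrete, the flow for a boxed-answer dataset looks roughly
+like this. The helper names are hypothetical, not the actual functions in `extractor.py` and
+`scoring.py`:
+
+```python
+import re
+
+def extract_boxed_answer(text: str) -> str | None:
+    """Pull the last \\boxed{...} value out of verbose reasoning text."""
+    matches = re.findall(r"\\boxed\{([^}]*)\}", text)
+    return matches[-1] if matches else None
+
+def string_match(extracted: str | None, ground_truth: str) -> bool:
+    """Whitespace-trimmed equality, as in the `string_match` method above."""
+    return extracted is not None and extracted.strip() == ground_truth.strip()
+
+response = r"Adding the digits... so the final answer is \boxed{42}."
+assert string_match(extract_boxed_answer(response), "42")
+```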
+ +## Integration Points + +| Component | Role | +| ---------------------------------- | ------------------------------------------------------ | +| `commands/benchmark/execute.py` | Builds scorer/extractor configs and runs scoring | +| `dataset_manager/predefined/` | Provides ground truth labels alongside prompts | +| `evaluation/livecodebench/` | Provides external execution path for LiveCodeBench | +| `results.json` / benchmark reports | Receives computed accuracy summary during finalization | diff --git a/docs/load_generator/DESIGN.md b/docs/load_generator/DESIGN.md new file mode 100644 index 00000000..27182d60 --- /dev/null +++ b/docs/load_generator/DESIGN.md @@ -0,0 +1,152 @@ +# Load Generator — Design Spec + +> Central orchestrator for a benchmark run: controls what samples to issue, when to issue them via pluggable schedulers, and routes completion events to the metrics recorder. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · **load_generator** · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`load_generator/` is the central orchestrator for a benchmark run. It controls **what** to send +(dataset samples), **when** to send them (load pattern), and **how** to observe the results +(event hooks feeding the metrics recorder). + +## Responsibilities + +- Manage the full benchmark session lifecycle (start → run → drain → report) +- Implement timing strategies: max throughput, Poisson, fixed concurrency +- Emit structured events for every sample state transition +- Coordinate graceful shutdown with in-flight drain + +## Component Map + +``` +BenchmarkSession ← top-level owner; runs on background thread + └── SchedulerBasedLoadGenerator ← iterates (sample_index, delay_ns) pairs + ├── Scheduler ← determines timing + │ ├── MaxThroughputScheduler (offline: all at t=0) + │ ├── PoissonDistributionScheduler (online: exp inter-arrival) + │ └── ConcurrencyScheduler (online: fixed in-flight count) + └── SampleIssuer (ABC) ← sends the query; implemented by endpoint_client/ +``` + +## Public Interface + +### `BenchmarkSession` + +```python +@classmethod +def start( + cls, + runtime_settings: RuntimeSettings, + dataset: Dataset, + sample_issuer: SampleIssuer, + scheduler: Scheduler, + *args, + accuracy_datasets: list[Dataset] | None = None, + load_generator_cls: type[LoadGenerator] = SchedulerBasedLoadGenerator, + name: str | None = None, + max_shutdown_timeout_s: float | None = None, + report_dir: os.PathLike | None = None, + tokenizer_override: AutoTokenizer | None = None, + dump_events_log: bool = False, +) -> "BenchmarkSession" + +def wait_for_test_end(self, timeout: float | None = None) -> bool +def stop(self) -> None +``` + +`start()` spawns the run thread immediately. `wait_for_test_end()` blocks the caller until the +session finishes or the timeout expires. `stop()` signals early termination. + +### `SampleIssuer` (abstract base class — implemented externally) + +```python +def start() -> None +def issue(sample: Sample) -> None +def shutdown() -> None +``` + +`SampleIssuer` is an `ABC`, not a structural protocol. 
`start()` and `shutdown()` have default +no-op implementations; subclasses must implement `issue()`. `issue()` must be non-blocking; +responses are delivered asynchronously via `SampleEventHandler`. + +### `Scheduler` (base class) + +```python +def __iter__(self) -> Iterator[tuple[int, int]] +# yields (sample_index, delay_ns) +``` + +Subclasses register themselves via `__init_subclass__(load_pattern=LoadPatternType.X)` and are +looked up at construction time. + +## Data Flow + +``` +BenchmarkSession._run_test() + │ + ├─ for (index, delay_ns) in SchedulerBasedLoadGenerator: + │ busy_wait(delay_ns) + │ sample = load_sample_data(index) + │ SampleIssuer.issue(sample) → async, fire-and-forget + │ + └─ wait_for_drain() ← blocks until all in-flight complete + │ + └─ SampleEventHandler routes completions: + FIRST_CHUNK → recorder.record_event(SampleEvent.FIRST_CHUNK) + COMPLETE → recorder.record_event(SampleEvent.COMPLETE) +``` + +## Design Decisions + +**Busy-wait for timing precision** + +`SchedulerBasedLoadGenerator` uses a busy-wait loop (`while time.monotonic_ns() < target_ns`) for +inter-sample delays rather than `asyncio.sleep()` or `time.sleep()`. This achieves sub-millisecond +timing accuracy at high QPS without introducing event-loop latency. The trade-off is elevated CPU +usage on the scheduling thread during the run. + +**Thread-based session, not async** + +`BenchmarkSession._run_test()` runs on a `threading.Thread`, not a coroutine. The scheduler loop +is blocking by design — it must not yield to the event loop, which could introduce scheduling jitter. +The async event loop is owned by `HTTPEndpointClient`, not the load generator. + +**`SampleEventHandler` singleton with registered hooks** + +All sample-level events (FIRST_CHUNK, COMPLETE, etc.) route through a single global +`_SampleEventHandler`. Hooks are registered before the run starts and remain constant for its +duration. This eliminates per-sample dispatch overhead at runtime. + +**`ConcurrencyScheduler` coordination via `threading.Condition`** + +The concurrency scheduler blocks issuance when in-flight count reaches the target, then wakes +via a Condition notified by the COMPLETE hook. This provides back-pressure without polling. 
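+
+As a reference point for the scheduler contract, a Poisson-style scheduler reduces to a few
+lines. This is a minimal sketch; the real `PoissonDistributionScheduler` constructor and
+registration details may differ:
+
+```python
+import random
+from collections.abc import Iterator
+
+class SketchPoissonScheduler(Scheduler, load_pattern=LoadPatternType.POISSON):
+    def __init__(self, target_qps: float, n_samples: int, rng: random.Random) -> None:
+        self.target_qps = target_qps
+        self.n_samples = n_samples
+        self.rng = rng
+
+    def __iter__(self) -> Iterator[tuple[int, int]]:
+        for index in range(self.n_samples):
+            # Exponential inter-arrival delays produce a Poisson arrival process.
+            delay_s = self.rng.expovariate(self.target_qps)
+            yield index, int(delay_s * 1e9)  # (sample_index, delay_ns)
+```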
+ +## Event Types + +| Event | Enum type | Meaning | +| --------------------------- | -------------- | --------------------------------------- | +| `TEST_STARTED` | `SessionEvent` | Run begins | +| `STOP_PERFORMANCE_TRACKING` | `SessionEvent` | Performance issuance phase has ended | +| `LOADGEN_STOP` | `SessionEvent` | Load generator finished issuing samples | +| `TEST_ENDED` | `SessionEvent` | Run complete | +| `LOADGEN_ISSUE_CALLED` | `SessionEvent` | `issue()` called | +| `LOADGEN_DATA_LOAD` | `SessionEvent` | Sample payload loaded from dataset | +| `HTTP_REQUEST_ISSUED` | `SampleEvent` | Request sent to endpoint | +| `HTTP_RESPONSE_COMPLETED` | `SampleEvent` | Endpoint HTTP response fully received | +| `FIRST_CHUNK` | `SampleEvent` | First SSE chunk received | +| `NON_FIRST_CHUNK` | `SampleEvent` | Subsequent SSE chunk | +| `COMPLETE` | `SampleEvent` | Final result received | + +## Integration Points + +| Dependency | Role | +| ---------------------------- | ---------------------------------------------------------- | +| `core/types.py` | `Query`, `QueryResult`, `StreamChunk` | +| `endpoint_client/` | Implements `SampleIssuer` | +| `metrics/recorder.py` | Receives all events via `SampleEventHandler` | +| `config/runtime_settings.py` | `RuntimeSettings` drives duration, sample count, RNG seeds | +| `dataset_manager/` | Provides `Dataset` for sample data | diff --git a/docs/metrics/DESIGN.md b/docs/metrics/DESIGN.md new file mode 100644 index 00000000..94b880be --- /dev/null +++ b/docs/metrics/DESIGN.md @@ -0,0 +1,175 @@ +# Metrics — Design Spec + +> Records per-sample timing events to SQLite during a run (write path), then aggregates them into QPS, latency percentiles, TTFT, and TPOT after the run (read path). + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · **metrics** · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`metrics/` records benchmark events during a run and aggregates them into performance metrics +afterwards. It is split into two parts with a clean boundary: `EventRecorder` writes; nothing +else does. `MetricsReporter` reads. + +## Responsibilities + +- Persist every timing event to SQLite during the run (write path) +- Aggregate events into QPS, latency percentiles, TTFT, and TPOT after the run (read path) +- Validate results against metric targets from the active ruleset +- Produce human-readable console output and machine-readable JSON reports + +## Component Map + +``` +SampleEventHandler ──► EventRecorder (SQLite, queue-backed) + │ + ▼ + MetricsReporter + │ + ┌───────────┴──────────┐ + ▼ ▼ + console output JSON report +``` + +## Public Interface + +### `EventRecorder` + +Only one `EventRecorder` may be actively writing at a time per process. The live instance is +accessible via the class variable `EventRecorder.LIVE`. 
+ +```python +class EventRecorder: + LIVE: "EventRecorder | None" # class variable; set on construction, cleared on close + + @classmethod + def record_event( + cls, + ev_type: Event, + timestamp_ns: int, + sample_uuid: str = "", + force_commit: bool = False, + assert_active: bool = True, + data: Any = None, + ) -> bool + + def wait_for_writes(self, force_commit: bool = True) -> None + # Blocks until the background writer thread has flushed all queued events + + @staticmethod + def db_path(session_id: str) -> Path +``` + +`record_event()` is a **classmethod** and is **non-blocking**: events are placed on a queue and +written by a background thread. Returns `True` if recorded, `False` if no recorder is active +(when `assert_active=False`). + +### `MetricsReporter` + +```python +class MetricsReporter: + def __init__( + self, + connection_name: os.PathLike, + client_type: str = "duckdb", + ) -> None + + def create_report( + self, + tokenizer: Tokenizer | None = None, + tpot_reporting_mode: TPOTReportingMode = TPOTReportingMode.REQUEST_WEIGHTED, + ) -> Report + + def dump_to_json(self, json_path: Path) -> None +``` + +`create_report()` executes SQL aggregation over the events database and returns a `Report` +object. The optional `tokenizer` enables output sequence length (OSL) computation. `Report` +itself exposes `display(fn=print, summary_only=False)` for console output. + +### Metric Types (`metric.py`) + +```python +class Throughput(Metric): + REL_TOL = 0.1 # ±10% relative tolerance + def __init__(self, target_qps: float): ... # stored as self.target + +class QueryLatency(Metric): + REL_TOL = 0.1 + def __init__(self, target_latency_ms: float | None = None, + target_qps: float | None = None): ... + +class TTFT(Metric): + def __init__(self, max_ttft_latency_ms: float): ... # hard ceiling + +class TPOT(Metric): + def __init__(self, max_tpot_latency_ms: float): ... # hard ceiling +``` + +Each metric exposes `is_valid(measurement) -> bool`. The target value is stored as +`self.target` on the base `Metric` class. + +## Data Flow + +### Write Path (during run) + +``` +SampleEventHandler.query_result_complete(result) + → EventRecorder.record_event( + SampleEvent.COMPLETE, + time.monotonic_ns(), + sample_uuid=result.id, + data={...}, + ) + → queue.put(EventRow(...)) # non-blocking + → background thread: INSERT INTO events +``` + +### Read Path (after run) + +``` +MetricsReporter.create_report() + → SELECT / GROUP BY on events table (DuckDB) + → compute percentiles (p50, p90, p99, p999) + → compute TTFT = time from LOADGEN_ISSUE_CALLED to FIRST_CHUNK + → compute TPOT = (COMPLETE.ts - FIRST_CHUNK.ts) / output_tokens + → compute tracked duration from TEST_STARTED / STOP_PERFORMANCE_TRACKING windows + → compute QPS = tracked completed samples / tracked duration + → validate each metric against RuntimeSettings.reported_metrics +``` + +## Design Decisions + +**SQLite as the event store** + +SQLite gives durable, queryable storage with no external dependencies. The write path uses a +single background writer thread (SQLite's WAL mode is single-writer) to avoid contention. +Aggregation uses DuckDB for columnar SQL performance over the file written by SQLite. + +**Queue-backed writes to decouple hot path** + +The `record_event()` call must not block the load generator thread. Events are placed on a +`queue.Queue` and consumed by a dedicated writer thread. The queue is unbounded; back-pressure +is not a concern because write throughput (SQLite) exceeds event rate in all tested scenarios. 
+ +**Singleton enforcement** + +Only one `EventRecorder` may exist per process. The singleton is enforced at construction time +with a class-level flag. This prevents double-counting if code accidentally constructs a second +recorder. + +**TPOT calculation modes** + +TPOT can be weighted by request (each request contributes equally) or by output token count +(each token contributes equally). The default is request-weighted. The `TPOTReportingMode` enum +controls this at report time without re-running the benchmark. + +## Integration Points + +| Component | Role | +| ---------------------------- | --------------------------------------------------------- | +| `load_generator/sample.py` | Calls `record_event()` for every state transition | +| `load_generator/session.py` | Calls `create_report()` at run end; saves output | +| `config/runtime_settings.py` | `reported_metrics` list drives which metrics are computed | +| `config/ruleset_base.py` | Provides `Metric` targets for validation | diff --git a/docs/openai/DESIGN.md b/docs/openai/DESIGN.md new file mode 100644 index 00000000..880ae7b7 --- /dev/null +++ b/docs/openai/DESIGN.md @@ -0,0 +1,125 @@ +# OpenAI Adapter — Design Spec + +> Translates internal `Query` objects into OpenAI Chat Completions HTTP requests and parses streaming SSE and non-streaming JSON responses back into `QueryResult`/`StreamChunk`. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · **openai** · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`openai/` adapts the system's internal `Query` type to OpenAI-compatible HTTP requests and +converts OpenAI API responses back into `QueryResult` and `StreamChunk`. It also provides +adapter-specific dataset transforms through the `HttpRequestAdapter` interface. + +## Responsibilities + +- Format `Query.data` into OpenAI Chat Completions request bodies +- Parse streaming SSE responses (chunked) into `StreamChunk` events +- Parse non-streaming JSON responses into `QueryResult` +- Provide a msgspec-optimised fast path for the hot path + +## Component Map + +``` +Query + | + v +HttpRequestAdapter (ABC) + | + openai_msgspec_adapter.py <-- hot path: msgspec encoding, no json.dumps + openai_adapter.py <-- general path + | + v +raw HTTP request bytes + | + v +HTTP response bytes + | + v +SSEAccumulatorProtocol (Protocol) + | + accumulator.py <-- assembles StreamChunk stream into QueryResult + | + v +QueryResult / StreamChunk +``` + +## Public Interface + +### `HttpRequestAdapter` (ABC, defined in `endpoint_client/adapter_protocol.py`) + +```python +class HttpRequestAdapter(ABC): + @classmethod + def dataset_transforms(cls, model_params: ModelParams) -> list[Transform]: ... + + @classmethod + def encode_query(cls, query: Query) -> bytes: ... + + @classmethod + def decode_response(cls, response_bytes: bytes, query_id: str) -> QueryResult: ... + + @classmethod + def decode_sse_message(cls, json_bytes: bytes) -> str: ... +``` + +`dataset_transforms()` returns adapter-specific transforms that shape dataset rows into the +expected `Query.data` schema. 
`encode_query()` serialises a `Query` to HTTP request bytes. +`decode_response()` parses a non-streaming response. `decode_sse_message()` extracts the content +string from a single SSE JSON payload; `parse_sse_chunk()` (concrete, on the base class) iterates +the SSE buffer and calls it repeatedly. + +### `SSEAccumulatorProtocol` (protocol, defined in `endpoint_client/accumulator_protocol.py`) + +```python +class SSEAccumulatorProtocol(Protocol): + def __init__(self, query_id: str, stream_all_chunks: bool) -> None: ... + def add_chunk(self, delta: Any) -> StreamChunk | None: ... + def get_final_output(self) -> QueryResult: ... +``` + +Workers construct a fresh accumulator for each streaming request by passing the request ID and +the `stream_all_chunks` mode. `add_chunk()` processes one API-specific SSE delta and returns a +`StreamChunk` when content should be emitted (None otherwise). `get_final_output()` returns the +assembled `QueryResult` after the stream is complete, so state is isolated per request rather +than shared across a connection. + +## Key Files + +| File | Purpose | +| --------------------------- | ------------------------------------------------------------ | +| `openai_msgspec_adapter.py` | Hot-path adapter; uses msgspec for request encoding | +| `openai_adapter.py` | Standard adapter; uses stdlib json | +| `accumulator.py` | Per-request streaming accumulator for OpenAI SSE deltas | +| `types.py` | Python type annotations for OpenAI response objects | +| `openai_types_gen.py` | Auto-generated from `openapi.yaml`; do not edit manually | +| `harmony.py` | Optional `openai-harmony` integration for compatibility shim | +| `openapi.yaml` | OpenAI API spec snapshot; excluded from pre-commit | + +## Design Decisions + +**msgspec adapter as the default hot path** + +`openai_msgspec_adapter.py` encodes requests using `msgspec.json.encode()` rather than +`json.dumps()`. At 50k+ QPS with small request bodies, the encoding time is measurable. +msgspec is 2-5x faster than stdlib json for typical Chat Completions request shapes. + +**Fresh accumulator per request** + +Workers construct a new accumulator for each streaming request. This keeps the accumulator +interface small (`add_chunk()` / `get_final_output()`) and avoids having to manage explicit +reset semantics across reused connections. + +**`openai_types_gen.py` is auto-generated** + +OpenAI type definitions are generated from the official OpenAPI spec. Manual edits would be +overwritten on regeneration. The file is excluded from ruff and pre-commit. + +## Integration Points + +| Component | Role | +| --------------------------- | ------------------------------------------------------- | +| `endpoint_client/worker.py` | Calls `encode_query()` and `accumulator.add_chunk()` | +| `endpoint_client/config.py` | Selects `openai_msgspec_adapter` when `api_type=OPENAI` | +| `core/types.py` | `StreamChunk`, `QueryResult` are the output types | diff --git a/docs/plugins/DESIGN.md b/docs/plugins/DESIGN.md new file mode 100644 index 00000000..31bc89ed --- /dev/null +++ b/docs/plugins/DESIGN.md @@ -0,0 +1,41 @@ +# Plugins — Design Spec + +> Reserved extension namespace for registering custom `HttpRequestAdapter`, `DatafileLoader`, and `BenchmarkSuiteRuleset` implementations without modifying core package code. 
+ +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · **plugins** · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`plugins/` is the extension point for adding custom adapters, dataset loaders, rulesets, or +other integrations without modifying core package code. + +## Responsibilities + +- Reserve a stable namespace for future plugin APIs +- Point readers to the concrete registries that exist today + +## Current State + +The plugin system is a reserved namespace. `plugins/__init__.py` currently contains only +placeholder documentation and does not yet expose a public registration interface. No built-in +plugins exist. When the first real plugin requirement arrives, the registration mechanism will +be implemented with a concrete use case. + +## Design Decisions + +**Namespace reservation over premature framework** + +The `plugins/` directory signals that extensibility is a first-class concern, without committing +to a specific plugin discovery mechanism (e.g. `importlib.metadata` entry points, config-based +loading). When the first real plugin requirement arrives, the mechanism can be chosen with a +concrete use case. + +## Integration Points + +| Extensible component | Current registration target | +| ----------------------- | -------------------------------------------- | +| `HttpRequestAdapter` | `endpoint_client/config.py` adapter registry | +| `DatafileLoader` | `dataset_manager/dataset.py` format registry | +| `BenchmarkSuiteRuleset` | `config/ruleset_registry.py` | diff --git a/docs/profiling/DESIGN.md b/docs/profiling/DESIGN.md new file mode 100644 index 00000000..062a0c3f --- /dev/null +++ b/docs/profiling/DESIGN.md @@ -0,0 +1,66 @@ +# Profiling — Design Spec + +> `line_profiler` integration with a zero-cost no-op decorator by default; activated via the `ENABLE_LINE_PROFILER=1` environment variable for line-level timing of hot-path functions. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · **profiling** · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`profiling/` integrates `line_profiler` into the benchmark run and provides a pytest plugin for +profiling during test execution. It is a developer tool with no effect on production runs unless +explicitly enabled. 
+ +## Responsibilities + +- Wrap functions with `line_profiler.LineProfiler` for line-level timing +- Emit profiling output at the end of a benchmark or test run +- Provide a pytest plugin that activates profiling when `ENABLE_LINE_PROFILER=1` is set + +## Files + +| File | Purpose | +| ---------------------------- | ------------------------------------------------ | +| `line_profiler.py` | `profile()` decorator and `dump_stats()` utility | +| `pytest_profiling_plugin.py` | pytest plugin; hooks into test session lifecycle | + +## Usage + +```python +from inference_endpoint.profiling import profile + +@profile +def hot_function(...): + ... +``` + +When profiling is inactive (default), `@profile` is a no-op. When active, it wraps the function +with `LineProfiler` and accumulates timing across all calls. + +In tests: + +```bash +ENABLE_LINE_PROFILER=1 pytest tests/unit/... +``` + +## Design Decisions + +**No-op decorator by default** + +Importing `@profile` from `profiling/` is safe in production code. When profiling is not +enabled, the decorator returns the original function unchanged. This means profiling annotations +can remain in hot-path code without any runtime cost. + +**`ENABLE_LINE_PROFILER` env var for selective activation** + +Setting `ENABLE_LINE_PROFILER=1` activates profiling for the process in question. This avoids +permanently modifying the code; `@profile` annotations can remain in hot-path code without any +runtime cost when the env var is unset. + +## Integration Points + +| Consumer | Usage | +| ------------------------------------- | ------------------------------------------------------------- | +| `endpoint_client/`, `load_generator/` | `@profile` annotations on hot-path functions | +| pytest | Plugin registered via `pytest_plugins` in `tests/conftest.py` | diff --git a/docs/sglang/DESIGN.md b/docs/sglang/DESIGN.md new file mode 100644 index 00000000..bd6c386d --- /dev/null +++ b/docs/sglang/DESIGN.md @@ -0,0 +1,53 @@ +# SGLang Adapter — Design Spec + +> SGLang-specific adapter implementing the same `HttpRequestAdapter` and `SSEAccumulatorProtocol` contracts as the OpenAI path, with SGLang wire format for requests and responses. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · **sglang** · [testing](../testing/DESIGN.md) · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`sglang/` is a thin adapter implementing the same `HttpRequestAdapter` and +`SSEAccumulatorProtocol` contracts as `openai/`, targeting the SGLang-specific API format. +It is structurally parallel to `openai/` and follows the same patterns. 
+ +## Responsibilities + +- Format `Query` dicts into SGLang-compatible HTTP request bodies +- Parse SGLang streaming and non-streaming responses into `QueryResult` / `StreamChunk` + +## Files + +| File | Purpose | +| ---------------- | --------------------------------------------------- | +| `adapter.py` | `HttpRequestAdapter` implementation for SGLang | +| `accumulator.py` | `SSEAccumulatorProtocol` for SGLang response format | +| `types.py` | Python type annotations for SGLang response objects | + +## Public Interface + +Identical protocols to `openai/` — the adapter implements `dataset_transforms()`, +`encode_query()`, `decode_response()`, and `decode_sse_message()`, while the accumulator +implements `add_chunk()` and `get_final_output()`. The only difference is the wire format of +requests and responses. + +## Design Decisions + +**Shared protocol, separate implementation** + +The `HttpRequestAdapter` and `SSEAccumulatorProtocol` protocols are defined in +`endpoint_client/adapter_protocol.py` and `endpoint_client/accumulator_protocol.py` respectively. +Both `openai/` and `sglang/` implement these protocols independently. `endpoint_client/config.py` +selects the appropriate implementation at construction time based on `api_type`. + +This means adding a new API format (e.g. TGI, vLLM native) requires only implementing the two +protocols and registering the implementation in `config.py` — no changes to worker or client code. + +## Integration Points + +| Component | Role | +| --------------------------- | --------------------------------------------- | +| `endpoint_client/config.py` | Selects SGLang adapter when `api_type=SGLANG` | +| `endpoint_client/worker.py` | Same call sites as OpenAI adapter | +| `core/types.py` | Output types are identical to OpenAI path | diff --git a/docs/testing/DESIGN.md b/docs/testing/DESIGN.md new file mode 100644 index 00000000..e5cbd9cb --- /dev/null +++ b/docs/testing/DESIGN.md @@ -0,0 +1,80 @@ +# Testing Utilities — Design Spec + +> Standalone local server implementations — echo, max-throughput, and variable-throughput — that substitute for real inference endpoints during local development and CI. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · **testing** · [utils](../utils/DESIGN.md) + +--- + +## Overview + +`testing/` provides local server implementations that mimic inference endpoints. They allow +the full benchmark stack to be exercised without a real GPU or remote service. + +## Responsibilities + +- Provide an OpenAI-compatible echo server for local functional testing +- Provide a configurable throughput server for performance testing +- Provide a Docker-based server launcher for integration test environments + +## Servers + +### `echo_server.py` + +Mirrors the request prompt back as the response. 
Used for: + +- Verifying end-to-end benchmark plumbing (CLI → client → server → metrics) +- Testing streaming and non-streaming response paths +- CI integration tests + +```bash +python3 -m inference_endpoint.testing.echo_server --port 8765 +python3 -m inference_endpoint.testing.echo_server --host 0.0.0.0 --port 9000 +``` + +The server implements the OpenAI Chat Completions API and accepts the standard `messages` request +shape. It is intended for functional testing of the request/response path, not for configurable +latency simulation. + +### `max_throughput_server.py` + +Returns minimal valid responses as fast as possible. Used for: + +- Measuring the upper bound of client throughput (removes server as a bottleneck) +- Performance regression testing of the HTTP client and transport layer + +### `variable_throughput_server.py` + +Returns responses at a configurable rate. Used for: + +- Testing scheduler behaviour under varying server latency +- Validating Poisson and concurrency scheduler correctness + +### `docker_server.py` + +Manages a Docker container running a real or simulated inference server. Used for: + +- Integration tests that require a more realistic server environment +- Automated test setup without manual container management + +## Design Decisions + +**Echo server uses the real OpenAI API format** + +The echo server accepts the full Chat Completions request shape, not a simplified subset. This +ensures that integration tests exercise the actual adapter code path (prompt formatting, header +generation) rather than a shortcut. + +**Servers are not test fixtures** + +These servers are standalone Python modules, not pytest fixtures. They can be run from the +command line independently of any test framework. Pytest fixtures in `tests/conftest.py` +(`mock_http_echo_server`, `mock_http_oracle_server`) wrap them for test use. + +## Integration Points + +| Consumer | Usage | +| ----------------------- | ------------------------------------------------------ | +| `tests/conftest.py` | Wraps `echo_server` as `mock_http_echo_server` fixture | +| `docs/LOCAL_TESTING.md` | Step-by-step guide for manual testing with echo server | +| `docs/DEVELOPMENT.md` | References echo server for local development workflow | diff --git a/docs/utils/DESIGN.md b/docs/utils/DESIGN.md new file mode 100644 index 00000000..0a49de10 --- /dev/null +++ b/docs/utils/DESIGN.md @@ -0,0 +1,50 @@ +# Utils — Design Spec + +> Shared helpers (logging setup, version, tokenizer utilities) and a standalone HTTP benchmarking tool. The core helper modules have no dependencies on other project subpackages. + +**Component specs:** [async_utils](../async_utils/DESIGN.md) · [commands](../commands/DESIGN.md) · [config](../config/DESIGN.md) · [core](../core/DESIGN.md) · [dataset_manager](../dataset_manager/DESIGN.md) · [endpoint_client](../endpoint_client/DESIGN.md) · [evaluation](../evaluation/DESIGN.md) · [load_generator](../load_generator/DESIGN.md) · [metrics](../metrics/DESIGN.md) · [openai](../openai/DESIGN.md) · [plugins](../plugins/DESIGN.md) · [profiling](../profiling/DESIGN.md) · [sglang](../sglang/DESIGN.md) · [testing](../testing/DESIGN.md) · **utils** + +--- + +## Overview + +`utils/` contains shared utilities that do not belong to any specific component. The core of this +package is a set of stateless helper modules with no cross-component dependencies. +`benchmark_httpclient.py` is a standalone benchmarking tool that lives here for convenience but +does import from other `inference_endpoint` subpackages. 
+ +## Files + +| File | Purpose | +| ------------------------- | ------------------------------------------------------------------------- | +| `logging.py` | Configures the root logger (format, level, handlers) | +| `version.py` | Exposes package version from `inference_endpoint.__version__` and git SHA | +| `dataset_utils.py` | Tokenizer inspection utilities (vocab stats, token length histograms) | +| `benchmark_httpclient.py` | Standalone HTTP throughput benchmarking utility (imports internals) | + +## Design Decisions + +**No cross-imports from `utils/` helper modules** + +`logging.py` and `dataset_utils.py` stay lightweight and broadly reusable. `version.py` is also +small, but it intentionally imports `inference_endpoint.__version__` and shells out to `git` to +report build metadata. `benchmark_httpclient.py` is exempt entirely: it is a standalone tool, not +a reusable helper. + +**`benchmark_httpclient.py` is a standalone tool** + +This module benchmarks the raw HTTP client throughput independent of the load generator and +scheduler. It is useful for diagnosing whether performance bottlenecks are in the client layer +or in the scheduling/coordination layer. For broader tuning guidance, see +[CLIENT_PERFORMANCE_TUNING.md](../CLIENT_PERFORMANCE_TUNING.md). It can be run directly: + +```bash +python3 -m inference_endpoint.utils.benchmark_httpclient --endpoint URL --workers 4 +``` + +## Integration Points + +| Consumer | Usage | +| ------------------ | -------------------------------------------- | +| `main.py` | Calls `setup_logging()` at startup | +| `commands/info.py` | Imports `__version__` for the `info` command | diff --git a/examples/README.md b/examples/README.md index 2b33d75e..5464e9d6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -24,6 +24,18 @@ Sample yaml configuration to benchmark `openai/gpt-oss-120b`. Sample yaml configuration to benchmark `meta-llama/Llama-3.1-8B-Instruct`. +### [06_Llama2-70B_Example](06_Llama2-70B_Example/) + +Sample yaml configuration for online benchmarking of `meta-llama/Llama-2-70b-chat-hf`. + +### [07_GPT-OSS-120B_SGLang_Example](07_GPT-OSS-120B_SGLang_Example/) + +End-to-end example for benchmarking `openai/gpt-oss-120b` served with SGLang, including helper scripts for AIME, GPQA, and LiveCodeBench evaluation. + +### [08_Qwen3-VL-235B-A22B_Example](08_Qwen3-VL-235B-A22B_Example/) + +Sample yaml configuration to benchmark the multimodal `Qwen/Qwen3-VL-235B-A22B` model on a visual reasoning workload. + ## Getting Help - For general usage: See main [README](../README.md) diff --git a/scripts/zmq_pubsub_demo.py b/scripts/zmq_pubsub_demo.py deleted file mode 100644 index ac310536..00000000 --- a/scripts/zmq_pubsub_demo.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -ZMQ Pub-Sub Demo using async_utils (EventRecord, EventPublisherService, etc.) - -Demonstrates the intended control flow: -- Publisher should be created within a ManagedZMQContext.scoped() context manager. -- Each subscriber has its own event loop (LoopManager.create_loop); init does NOT start processing. -- When ready, .start() is called on each subscriber to add the reader and begin receiving. -- process(records) is async and scheduled via create_task so it does not block the socket. -- Cleanup: .close() on subscribers when the session has ended. 
-
-Same logical behavior as zmq_pubsub_simple_demo.py (console log, file output, duration stats)
-but using the async_utils APIs and no extra queuing layer.
-
-Usage:
-    python scripts/zmq_pubsub_async_utils_demo.py
-"""
-
-import asyncio
-import logging
-import time
-import uuid
-from pathlib import Path
-
-from inference_endpoint.async_utils.autoinit import LOOP_MANAGER
-from inference_endpoint.async_utils.event_publisher import EventPublisherService
-from inference_endpoint.async_utils.transport.record import (
-    EventRecord,
-    SampleEventType,
-    SessionEventType,
-)
-from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
-from inference_endpoint.async_utils.transport.zmq.pubsub import ZmqEventRecordSubscriber
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(name)s] %(message)s",
-)
-logger = logging.getLogger(__name__)
-
-
-# =============================================================================
-# Subscribers: each has its own loop and implements async process()
-# =============================================================================
-
-
-class ConsoleSubscriber(ZmqEventRecordSubscriber):
-    """Logs events to console. process() is async and runs when records are received."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.event_count = 0
-
-    async def process(self, records: list[EventRecord]) -> None:
-        for event in records:
-            if event.event_type == SessionEventType.ENDED:
-                logger.info("[Console] Received shutdown signal (session.ended)")
-            self.event_count += 1
-            sample_id = event.sample_uuid[:8] if event.sample_uuid else "N/A"
-            logger.info(
-                f"[Console] {event.event_type.topic} | {event.event_type.name} | "
-                f"sample={sample_id} | data={event.data}"
-            )
-
-
-class FileSubscriber(ZmqEventRecordSubscriber):
-    """Writes events to a file. process() is async."""
-
-    def __init__(self, output_file: Path, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.output_file = output_file
-        self.event_count = 0
-        self._file = open(output_file, "w")
-        self._file.write("timestamp_ns,topic,event_type,sample_uuid,data\n")
-
-    async def process(self, records: list[EventRecord]) -> None:
-        for event in records:
-            if event.event_type == SessionEventType.ENDED:
-                logger.info("[File] Received shutdown signal (session.ended)")
-            self.event_count += 1
-            data_str = str(event.data) if event.data else ""
-            self._file.write(
-                f"{event.timestamp_ns},{event.event_type.topic},"
-                f"{event.event_type.name},{event.sample_uuid},{data_str}\n"
-            )
-        self._file.flush()
-
-    def close(self) -> None:
-        if not self.is_closed and hasattr(self, "_file") and self._file is not None:
-            try:
-                self._file.close()
-            except OSError:
-                # File may already be closed or I/O error on close (e.g. disk full).
-                pass
-            self._file = None
-        super().close()
-
-
-class DurationSubscriber(ZmqEventRecordSubscriber):
-    """Tracks sample durations from ISSUED to COMPLETE. process() is async."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.start_times: dict[str, int] = {}
-        self.durations: dict[str, int] = {}
-        self.event_count = 0
-
-    async def process(self, records: list[EventRecord]) -> None:
-        for event in records:
-            if event.event_type == SessionEventType.ENDED:
-                logger.info("[Duration] Received shutdown signal (session.ended)")
-            self.event_count += 1
-            if event.event_type == SampleEventType.ISSUED:
-                self.start_times[event.sample_uuid] = event.timestamp_ns
-            elif event.event_type == SampleEventType.COMPLETE:
-                if event.sample_uuid in self.start_times:
-                    start_ns = self.start_times[event.sample_uuid]
-                    duration_ns = event.timestamp_ns - start_ns
-                    self.durations[event.sample_uuid] = duration_ns
-                    sample_id = event.sample_uuid[:8]
-                    logger.info(
-                        f"[Duration] Sample {sample_id} completed in {duration_ns}ns"
-                    )
-
-    def close(self) -> None:
-        if self.durations:
-            durations_ns = list(self.durations.values())
-            avg_ns = sum(durations_ns) / len(durations_ns)
-            min_ns = min(durations_ns)
-            max_ns = max(durations_ns)
-            logger.info(
-                f"Duration stats: avg={avg_ns:.0f}ns, min={min_ns}ns, max={max_ns}ns"
-            )
-        super().close()
-
-
-# =============================================================================
-# Publish test events (same sequence as simple demo)
-# =============================================================================
-
-
-async def publish_test_events(publisher) -> None:
-    """Publish hard-coded events using EventPublisherService and EventRecord."""
-    logger.info("Waiting for subscribers to connect...")
-    await asyncio.sleep(0.5)
-
-    logger.info("Publishing test events...")
-    uuid1 = uuid.uuid4().hex
-    uuid2 = uuid.uuid4().hex
-    uuid3 = uuid.uuid4().hex
-
-    events: list[EventRecord] = [
-        EventRecord(
-            event_type=SampleEventType.ISSUED,
-            timestamp_ns=10000,
-            sample_uuid=uuid1,
-        ),
-        EventRecord(
-            event_type=SampleEventType.ISSUED,
-            timestamp_ns=10003,
-            sample_uuid=uuid2,
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_FIRST,
-            timestamp_ns=10010,
-            sample_uuid=uuid1,
-            data={"ttft_ms": 10.0},
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_FIRST,
-            timestamp_ns=10190,
-            sample_uuid=uuid2,
-            data={"ttft_ms": 187.0},
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_NON_FIRST,
-            timestamp_ns=10201,
-            sample_uuid=uuid1,
-        ),
-        EventRecord(
-            event_type=SampleEventType.ISSUED,
-            timestamp_ns=10202,
-            sample_uuid=uuid3,
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_NON_FIRST,
-            timestamp_ns=10203,
-            sample_uuid=uuid1,
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_NON_FIRST,
-            timestamp_ns=10210,
-            sample_uuid=uuid2,
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_NON_FIRST,
-            timestamp_ns=10211,
-            sample_uuid=uuid1,
-        ),
-        EventRecord(
-            event_type=SampleEventType.COMPLETE,
-            timestamp_ns=10211,
-            sample_uuid=uuid1,
-            data={"tokens": 50},
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_NON_FIRST,
-            timestamp_ns=10214,
-            sample_uuid=uuid2,
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_NON_FIRST,
-            timestamp_ns=10217,
-            sample_uuid=uuid2,
-        ),
-        EventRecord(
-            event_type=SampleEventType.RECV_NON_FIRST,
-            timestamp_ns=10219,
-            sample_uuid=uuid2,
-        ),
-        EventRecord(
-            event_type=SampleEventType.COMPLETE,
-            timestamp_ns=10219,
-            sample_uuid=uuid2,
-            data={"tokens": 75},
-        ),
-    ]
-
-    logger.info(f"Generated {len(events)} events for 3 samples")
-    logger.info(f"Sample UUIDs: {uuid1[:8]}, {uuid2[:8]}, {uuid3[:8]}")
-
-    for i, event in enumerate(events, 1):
-        publisher.publish(event)
-        logger.info(f"Published event {i}/{len(events)}: {event.event_type.topic}")
-        await asyncio.sleep(0.05)
-
-    shutdown_event = EventRecord(
-        event_type=SessionEventType.ENDED,
-        timestamp_ns=time.monotonic_ns(),
-        sample_uuid="",
-    )
-    publisher.publish(shutdown_event)
-    logger.info("Sending shutdown signal (session.ended)...")
-    await asyncio.sleep(0.2)
-    logger.info("All events published")
-
-
-# =============================================================================
-# Main
-# =============================================================================
-
-
-async def main() -> None:
-    logger.info("=" * 80)
-    logger.info("ZMQ Pub-Sub Demo (async_utils)")
-    logger.info("=" * 80)
-
-    output_file = Path("/tmp/zmq_events_async_utils_output.csv")
-    with ManagedZMQContext.scoped() as zmq_ctx:
-        publisher = EventPublisherService(zmq_ctx)
-        connect_path = publisher.bind_path
-
-        # Each subscriber has its own event loop (not shared with the publisher).
-        console_loop = LOOP_MANAGER.create_loop("demo_console")
-        file_loop = LOOP_MANAGER.create_loop("demo_file")
-        duration_loop = LOOP_MANAGER.create_loop("demo_duration")
-
-        logger.info("Creating subscribers (init does NOT start processing)...")
-        console_sub = ConsoleSubscriber(
-            path=connect_path,
-            zmq_context=zmq_ctx,
-            loop=console_loop,
-            topics=None,
-        )
-        file_sub = FileSubscriber(
-            output_file,
-            path=connect_path,
-            zmq_context=zmq_ctx,
-            loop=file_loop,
-            topics=None,
-        )
-        duration_sub = DurationSubscriber(
-            path=connect_path,
-            zmq_context=zmq_ctx,
-            loop=duration_loop,
-            topics=[
-                SampleEventType.ISSUED.topic,
-                SampleEventType.COMPLETE.topic,
-                SessionEventType.ENDED.topic,
-            ],
-        )
-        logger.info("Subscribers created")
-
-        # Start listening (add reader to each loop).
-        logger.info("Starting subscribers (.start())...")
-        console_loop.call_soon_threadsafe(console_sub.start)
-        file_loop.call_soon_threadsafe(file_sub.start)
-        duration_loop.call_soon_threadsafe(duration_sub.start)
-
-        try:
-            await publish_test_events(publisher)
-        except KeyboardInterrupt:
-            logger.info("Interrupted by user")
-        finally:
-            logger.info("Cleaning up (closing subscribers)...")
-            console_sub.close()
-            file_sub.close()
-            duration_sub.close()
-            logger.info("=" * 80)
-            logger.info(f"Output file written to: {output_file}")
-            logger.info("Demo complete")
-            logger.info("=" * 80)
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/src/inference_endpoint/openai/openai_adapter.py b/src/inference_endpoint/openai/openai_adapter.py
index 9dd57e50..5834d6b0 100644
--- a/src/inference_endpoint/openai/openai_adapter.py
+++ b/src/inference_endpoint/openai/openai_adapter.py
@@ -88,7 +88,7 @@ def decode_sse_message(cls, json_bytes: bytes) -> str:
     def to_endpoint_request(cls, query: Query) -> CreateChatCompletionRequest:
         """Convert a Query to an OpenAI request."""
         if "prompt" not in query.data:
-            raise ValueError("prompt not found in json_value")
+            raise ValueError("prompt not found in query.data")
 
         messages = [{"role": Role5.user.value, "content": query.data["prompt"]}]
         if "system" in query.data: