Skip to content

Commit fb0eac5

Browse files
feat: bundle prompt suites in package, accept suite names in --prompts
Users can now pass --prompts reasoning instead of a file path. All 5 suites (37KB) ship with pip install infer-check. The resolver accepts both bundled names and file paths.
1 parent 90c1bf4 commit fb0eac5

10 files changed

Lines changed: 270 additions & 22 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ uv.lock
2525
htmlcov/
2626
.test_cache/
2727

28-
# infer-check results (large, generated)
28+
# infer-check results and caches (generated)
2929
results/
3030
*.report.html
3131
*.report.json

README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,12 @@ infer-check sweep \
5050
8bit=mlx-community/Meta-Llama-3.1-8B-Instruct-8bit,\
5151
4bit=mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" \
5252
--backend mlx-lm \
53-
--prompts ./prompt-suites/reasoning.jsonl \
53+
--prompts reasoning \
5454
--output ./results/sweep/
5555
```
5656

57+
`--prompts` accepts either a bundled suite name (`reasoning`, `code`, `adversarial-numerics`, `determinism`, `long-context`) or a path to any `.jsonl` file.
58+
5759
The baseline is automatically run twice as a self-check — if it's not 50/50 identical, your comparison data is unreliable.
5860

5961
```
@@ -79,7 +81,7 @@ infer-check diff \
7981
--model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
8082
--backends "mlx-lm,openai-compat" \
8183
--base-urls ",http://localhost:8000" \
82-
--prompts ./prompt-suites/reasoning.jsonl \
84+
--prompts reasoning \
8385
--output ./results/diff/
8486
```
8587

@@ -93,7 +95,7 @@ Same prompt N times at temperature=0. Output should be bit-identical every run.
9395
infer-check determinism \
9496
--model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
9597
--backend mlx-lm \
96-
--prompts ./prompt-suites/determinism.jsonl \
98+
--prompts determinism \
9799
--runs 20 \
98100
--output ./results/determinism/
99101
```
@@ -107,7 +109,7 @@ infer-check stress \
107109
--model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
108110
--backend openai-compat \
109111
--base-url http://localhost:8000 \
110-
--prompts ./prompt-suites/reasoning.jsonl \
112+
--prompts reasoning \
111113
--concurrency 1,2,4,8 \
112114
--output ./results/stress/
113115
```
@@ -132,7 +134,7 @@ Curated prompts targeting known quantization failure modes:
132134
| `long-context.jsonl` | 10 | Tables and transcripts with recall questions |
133135
| `determinism.jsonl` | 50 | High-entropy continuations for determinism testing |
134136

135-
Custom suites are JSONL files: `{"id": "...", "text": "...", "category": "...", "max_tokens": N}` per line.
137+
All suites ship with the package — no need to clone the repo. Custom suites are JSONL files: `{"id": "...", "text": "...", "category": "...", "max_tokens": N}` per line.
136138

137139
## Supported backends
138140

src/infer_check/cli.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,16 @@
1414
console = Console()
1515

1616

17+
def _resolve_prompts(prompts: str) -> Path:
    """Resolve the ``--prompts`` value to a prompt-suite file path.

    Delegates to ``infer_check.prompt_suites.get_suite_path`` and converts
    its ``FileNotFoundError`` into a click usage error so the CLI prints a
    clean message instead of a traceback.

    Args:
        prompts: Raw value of the ``--prompts`` option (a bundled suite
            name or a path to a ``.jsonl`` file).

    Returns:
        The path returned by ``get_suite_path``.

    Raises:
        click.BadParameter: If ``get_suite_path`` raises
            ``FileNotFoundError``; the original message is preserved.
    """
    from infer_check.prompt_suites import get_suite_path

    try:
        return get_suite_path(prompts)
    except FileNotFoundError as exc:
        # This runs inside the command body, not a parameter callback, so
        # click cannot infer which option was bad; name it explicitly so the
        # error reads "Invalid value for '--prompts': ...".
        raise click.BadParameter(str(exc), param_hint="'--prompts'") from exc
25+
26+
1727
@click.group()
1828
@click.version_option(package_name="infer-check")
1929
def main() -> None:
@@ -39,8 +49,7 @@ def main() -> None:
3949
@click.option(
4050
"--prompts",
4151
required=True,
42-
type=click.Path(exists=True, dir_okay=False, path_type=Path),
43-
help="Path to JSONL prompt suite.",
52+
help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
4453
)
4554
@click.option(
4655
"--output",
@@ -58,7 +67,7 @@ def main() -> None:
5867
def sweep(
5968
models: str,
6069
backend: str,
61-
prompts: Path,
70+
prompts: str,
6271
output: Path,
6372
baseline: str | None,
6473
base_url: str | None,
@@ -112,7 +121,7 @@ def sweep(
112121
tag = " (baseline)" if label == baseline_label else ""
113122
console.print(f" {label}: {path}{tag}")
114123

115-
prompt_list = load_suite(prompts)
124+
prompt_list = load_suite(_resolve_prompts(prompts))
116125

117126
# Build a separate backend for each model
118127
backend_map: dict[str, Any] = {}
@@ -214,8 +223,7 @@ def sweep(
214223
@click.option(
215224
"--prompts",
216225
required=True,
217-
type=click.Path(exists=True, dir_okay=False, path_type=Path),
218-
help="Path to JSONL prompt suite.",
226+
help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
219227
)
220228
@click.option(
221229
"--output",
@@ -239,7 +247,7 @@ def sweep(
239247
def diff(
240248
model: str,
241249
backends: str,
242-
prompts: Path,
250+
prompts: str,
243251
output: Path,
244252
quant: str | None,
245253
base_urls: str | None,
@@ -262,7 +270,7 @@ def diff(
262270
f"[bold cyan]diff[/bold cyan] model={model} backends={backend_names} quant={quant}"
263271
)
264272

265-
prompt_list = load_suite(prompts)
273+
prompt_list = load_suite(_resolve_prompts(prompts))
266274

267275
backend_instances = []
268276
for name, url in zip(backend_names, url_list, strict=True):
@@ -332,8 +340,7 @@ def diff(
332340
@click.option(
333341
"--prompts",
334342
required=True,
335-
type=click.Path(exists=True, dir_okay=False, path_type=Path),
336-
help="Path to JSONL prompt suite.",
343+
help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
337344
)
338345
@click.option(
339346
"--output",
@@ -352,7 +359,7 @@ def diff(
352359
def stress(
353360
model: str,
354361
backend: str,
355-
prompts: Path,
362+
prompts: str,
356363
output: Path,
357364
concurrency: str,
358365
base_url: str | None,
@@ -369,7 +376,7 @@ def stress(
369376
f"concurrency={concurrency_levels}"
370377
)
371378

372-
prompt_list = load_suite(prompts)
379+
prompt_list = load_suite(_resolve_prompts(prompts))
373380

374381
config = BackendConfig(
375382
backend_type=backend, # type: ignore[arg-type]
@@ -424,8 +431,7 @@ def stress(
424431
@click.option(
425432
"--prompts",
426433
required=True,
427-
type=click.Path(exists=True, dir_okay=False, path_type=Path),
428-
help="Path to JSONL prompt suite.",
434+
help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
429435
)
430436
@click.option(
431437
"--output",
@@ -439,7 +445,7 @@ def stress(
439445
def determinism(
440446
model: str,
441447
backend: str,
442-
prompts: Path,
448+
prompts: str,
443449
output: Path,
444450
runs: int,
445451
base_url: str | None,
@@ -451,7 +457,7 @@ def determinism(
451457

452458
console.print(f"[bold cyan]determinism[/bold cyan] model={model} backend={backend} runs={runs}")
453459

454-
prompt_list = load_suite(prompts)
460+
prompt_list = load_suite(_resolve_prompts(prompts))
455461

456462
config = BackendConfig(
457463
backend_type=backend, # type: ignore[arg-type]
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""Bundled prompt suites for infer-check.
2+
3+
Use ``get_suite_path("reasoning")`` to get the path to a bundled suite,
4+
or ``list_suites()`` to see all available suites.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from importlib import resources
10+
from pathlib import Path
11+
12+
__all__ = ["get_suite_path", "list_suites"]
13+
14+
_PACKAGE = "infer_check.prompt_suites"
15+
16+
17+
def list_suites() -> list[str]:
    """Return the sorted names of every bundled suite.

    A bundled suite is any entry in the package directory whose name ends
    with ``.jsonl``; the suffix is stripped from the returned names.
    """
    suffix = ".jsonl"
    return sorted(
        entry.name.removesuffix(suffix)
        for entry in resources.files(_PACKAGE).iterdir()
        if hasattr(entry, "name") and entry.name.endswith(suffix)
    )
24+
25+
26+
def get_suite_path(name: str) -> Path:
    """Resolve a suite name or file path to a prompt-suite file path.

    Lookup order:

    1. ``name`` is an existing regular file: it is returned as a ``Path``
       unchanged.
    2. ``name`` is a bundled suite name, with or without the ``.jsonl``
       suffix (``"reasoning"`` or ``"reasoning.jsonl"``): the path of the
       packaged file is returned.

    Directories are never returned, so a directory that happens to share a
    suite's name (e.g. ``./reasoning/``) does not shadow the bundled suite.

    Args:
        name: Bundled suite name or path to a ``.jsonl`` file.

    Returns:
        Path to the suite file.

    Raises:
        FileNotFoundError: If ``name`` is neither an existing file nor a
            bundled suite. The message lists the available suite names.
    """
    candidate = Path(name)
    # is_file(), not exists(): exists() is also true for directories (and for
    # "" which is "."), which would hand a directory to the suite loader.
    if candidate.is_file():
        return candidate

    # Accept both "reasoning" and "reasoning.jsonl" as bundled names.
    suite_name = name.removesuffix(".jsonl")
    ref = resources.files(_PACKAGE) / f"{suite_name}.jsonl"
    if ref.is_file():
        # NOTE(review): str(ref) is only a usable filesystem path when the
        # package is installed unpacked on disk; a zip-imported package would
        # need resources.as_file() to extract the resource first.
        return Path(str(ref))

    available = list_suites()
    raise FileNotFoundError(
        f"Prompt suite '{name}' not found.\n"
        f"Available bundled suites: {', '.join(available)}\n"
        f"Or pass a path to a .jsonl file."
    )
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{"text": "What is 0.1 + 0.2? Give a precise decimal answer.", "category": "floating_point", "max_tokens": 256}
2+
{"text": "Compute 9999999999999999 + 1.", "category": "large_numbers", "max_tokens": 256}
3+
{"text": "What is 10^308?", "category": "large_numbers", "max_tokens": 256}
4+
{"text": "What is 1/3 expressed as a decimal to 20 places?", "category": "precision", "max_tokens": 256}
5+
{"text": "Is 2^31 - 1 equal to 2147483647?", "category": "large_numbers", "max_tokens": 256}
6+
{"text": "What is the decimal representation of 1/7?", "category": "precision", "max_tokens": 256}
7+
{"text": "Compute: (-1)^(1/3). Is it -1 or a complex number?", "category": "edge_case", "max_tokens": 256}
8+
{"text": "What is 0^0?", "category": "edge_case", "max_tokens": 256}
9+
{"text": "What is the value of 1e-300 * 1e-300?", "category": "underflow", "max_tokens": 256}
10+
{"text": "Compute 170! (170 factorial). How many digits does it have?", "category": "large_numbers", "max_tokens": 256}
11+
{"text": "Write the number 1000000000000 in words.", "category": "formatting", "max_tokens": 256}
12+
{"text": "What is -0.0 equal to? Is -0.0 == 0.0 in IEEE 754?", "category": "edge_case", "max_tokens": 256}
13+
{"text": "Convert $1,234,567.89 to Japanese yen at a rate of 149.32 yen per dollar.", "category": "precision", "max_tokens": 256}
14+
{"text": "What is infinity minus infinity?", "category": "edge_case", "max_tokens": 256}
15+
{"text": "How many seconds are in a leap year? Show your calculation.", "category": "precision", "max_tokens": 256}
16+
{"text": "What is floor(-2.5)? What is round(-2.5)?", "category": "edge_case", "max_tokens": 256}
17+
{"text": "Express the speed of light (299792458 m/s) in miles per hour.", "category": "precision", "max_tokens": 256}
18+
{"text": "What is 2^53 + 1? Can this number be exactly represented as a 64-bit float?", "category": "floating_point", "max_tokens": 256}
19+
{"text": "Compute the sum: 1 + 1/2 + 1/4 + 1/8 + ... (infinite geometric series)", "category": "precision", "max_tokens": 256}
20+
{"text": "What is NaN == NaN in IEEE 754 floating point?", "category": "edge_case", "max_tokens": 256}
21+
{"text": "How many grains of sand are on Earth? Express in scientific notation.", "category": "large_numbers", "max_tokens": 256}
22+
{"text": "What is 7/13 * 13/7? Is the result exactly 1?", "category": "floating_point", "max_tokens": 256}
23+
{"text": "Convert the binary number 10110011.101 to decimal.", "category": "formatting", "max_tokens": 256}
24+
{"text": "What is 999...9 (fifty 9s) + 1?", "category": "large_numbers", "max_tokens": 256}
25+
{"text": "Compute 1/0. What happens? Now compute 1/0.0 in IEEE 754.", "category": "edge_case", "max_tokens": 256}
26+
{"text": "Express π to 30 decimal places.", "category": "precision", "max_tokens": 256}
27+
{"text": "What is MAX_INT in a 64-bit signed integer? What happens if you add 1 to it?", "category": "edge_case", "max_tokens": 256}
28+
{"text": "How many microseconds are in 3 years, 7 months, and 12 days?", "category": "precision", "max_tokens": 512}
29+
{"text": "What is 1/998001? This produces an interesting decimal pattern.", "category": "precision", "max_tokens": 512}
30+
{"text": "Is 0.9999999999 (ten 9s) equal to 1? What about 0.999... (infinitely repeating)?", "category": "floating_point", "max_tokens": 256}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{"text": "Write a Python function that checks if a string is a valid IPv4 address.", "category": "python", "max_tokens": 512}
2+
{"text": "Generate a valid JSON object representing a user profile with name, email, age, and a list of 3 hobbies.", "category": "json", "max_tokens": 256}
3+
{"text": "Write a Python function to find the longest common subsequence of two strings.", "category": "python", "max_tokens": 512}
4+
{"text": "Here is a buggy function:\ndef fib(n):\n if n <= 1:\n return 1\n return fib(n-1) + fib(n-2)\nWhat is wrong with it? Fix it so fib(0)=0, fib(1)=1.", "category": "debugging", "max_tokens": 512}
5+
{"text": "Write a Python class called BankAccount with deposit, withdraw, and balance methods. Withdrawals should fail if insufficient funds.", "category": "python", "max_tokens": 512}
6+
{"text": "Generate a JSON array of 5 objects, each with fields: id (integer), name (string), score (float between 0 and 1), tags (array of strings).", "category": "json", "max_tokens": 512}
7+
{"text": "Write a Python function that merges two sorted lists into one sorted list without using the built-in sort.", "category": "python", "max_tokens": 512}
8+
{"text": "Complete this Python code:\nimport re\ndef extract_emails(text):\n \"\"\"Return all email addresses found in text.\"\"\"", "category": "completion", "max_tokens": 256}
9+
{"text": "Write a SQL query that finds the second highest salary from an employees table.", "category": "sql", "max_tokens": 256}
10+
{"text": "Here is buggy code:\ndef binary_search(arr, target):\n low, high = 0, len(arr)\n while low < high:\n mid = (low + high) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n low = mid\n else:\n high = mid\n return -1\nThis has an infinite loop bug. Find and fix it.", "category": "debugging", "max_tokens": 512}
11+
{"text": "Write a Python generator function that yields prime numbers indefinitely.", "category": "python", "max_tokens": 512}
12+
{"text": "Generate a JSON Schema that validates an object with required fields: name (string, min 1 char), age (integer, min 0, max 150), email (string, format email).", "category": "json", "max_tokens": 512}
13+
{"text": "Write a Python decorator that caches function results (memoization) using a dictionary.", "category": "python", "max_tokens": 512}
14+
{"text": "Complete this code:\nclass Stack:\n def __init__(self):\n self._items = []\n \n def push(self, item):", "category": "completion", "max_tokens": 256}
15+
{"text": "Write a Python function that converts a Roman numeral string to an integer.", "category": "python", "max_tokens": 512}
16+
{"text": "Here is buggy code:\ndef flatten(lst):\n result = []\n for item in lst:\n if isinstance(item, list):\n result.extend(item)\n else:\n result.append(item)\n return result\nThis doesn't handle deeply nested lists. Fix it to work recursively.", "category": "debugging", "max_tokens": 512}
17+
{"text": "Write a Python async function that fetches 3 URLs concurrently using asyncio and aiohttp.", "category": "python", "max_tokens": 512}
18+
{"text": "Generate valid YAML for a Kubernetes deployment with 3 replicas of an nginx container on port 80.", "category": "yaml", "max_tokens": 512}
19+
{"text": "Write a Python function that implements the Levenshtein edit distance between two strings.", "category": "python", "max_tokens": 512}
20+
{"text": "Complete this code:\ndef parse_csv_line(line: str) -> list[str]:\n \"\"\"Parse a CSV line handling quoted fields with commas inside.\"\"\"", "category": "completion", "max_tokens": 512}
21+
{"text": "Write a Python context manager class that measures and prints execution time of a code block.", "category": "python", "max_tokens": 256}
22+
{"text": "Generate a JSON Web Token (JWT) payload with fields: sub, iat (unix timestamp for now), exp (1 hour from now), role, permissions array.", "category": "json", "max_tokens": 256}
23+
{"text": "Write a Python function that takes a nested dictionary and flattens it with dot-notation keys. Example: {'a': {'b': 1}} -> {'a.b': 1}", "category": "python", "max_tokens": 512}
24+
{"text": "Here is code with a subtle bug:\ndef unique_chars(s):\n return len(s) == len(set(s))\nDoes this work for all Unicode strings? What about combining characters?", "category": "debugging", "max_tokens": 512}
25+
{"text": "Write a Python function that validates whether a string of parentheses, brackets, and braces is balanced.", "category": "python", "max_tokens": 256}
26+
{"text": "Generate a JSON object representing an API error response following RFC 7807 (Problem Details).", "category": "json", "max_tokens": 256}
27+
{"text": "Write a Python function that implements run-length encoding. 'AAABBBCCCC' -> '3A3B4C'", "category": "python", "max_tokens": 256}
28+
{"text": "Write a Python type-annotated function that takes a list of dicts and groups them by a specified key.", "category": "python", "max_tokens": 512}
29+
{"text": "Complete this code to implement a simple LRU cache:\nclass LRUCache:\n def __init__(self, capacity: int):", "category": "completion", "max_tokens": 512}
30+
{"text": "Write a regular expression that matches valid email addresses. Explain each part.", "category": "python", "max_tokens": 512}
31+
{"text": "Here is buggy Python:\ndef quicksort(arr):\n if len(arr) <= 1:\n return arr\n pivot = arr[0]\n left = [x for x in arr if x < pivot]\n right = [x for x in arr if x > pivot]\n return quicksort(left) + [pivot] + quicksort(right)\nWhat happens with duplicate elements? Fix it.", "category": "debugging", "max_tokens": 512}
32+
{"text": "Write a Python function that converts a Python dictionary to a valid GraphQL query string.", "category": "python", "max_tokens": 512}
33+
{"text": "Generate a JSON array where each element is a date string in ISO 8601 format for every Monday in March 2026.", "category": "json", "max_tokens": 256}
34+
{"text": "Write a Python function that reads a file and returns the 10 most frequent words with their counts.", "category": "python", "max_tokens": 512}
35+
{"text": "Complete this async generator:\nasync def read_chunks(stream, chunk_size=1024):\n \"\"\"Yield chunks from an async byte stream.\"\"\"", "category": "completion", "max_tokens": 256}
36+
{"text": "Write a Python dataclass for a 2D Point with distance_to, midpoint, and __add__ methods.", "category": "python", "max_tokens": 512}
37+
{"text": "Write a one-liner Python list comprehension that generates all Pythagorean triples where a,b,c < 100.", "category": "python", "max_tokens": 256}
38+
{"text": "Generate a minimal valid HTML5 document with a title, a heading, a paragraph, and a link.", "category": "html", "max_tokens": 256}
39+
{"text": "Write a Python function that converts a flat list of parent-child pairs into a tree structure.", "category": "python", "max_tokens": 512}
40+
{"text": "Here is buggy code:\ndef safe_divide(a, b):\n try:\n return a / b\n except:\n return 0\nWhat are the problems with this error handling? Rewrite it properly.", "category": "debugging", "max_tokens": 512}
41+
{"text": "Write a Python function that implements the Sieve of Eratosthenes up to n.", "category": "python", "max_tokens": 256}
42+
{"text": "Generate a JSON object with intentionally tricky values: empty string, null, false, 0, empty array, nested nulls.", "category": "json", "max_tokens": 256}
43+
{"text": "Write a Dockerfile for a Python 3.11 FastAPI app that listens on port 8000.", "category": "devops", "max_tokens": 256}
44+
{"text": "Write a Python function that takes markdown text and extracts all URLs from it.", "category": "python", "max_tokens": 256}
45+
{"text": "Complete this:\ndef retry(max_attempts=3, delay=1.0):\n \"\"\"Decorator that retries a function on exception.\"\"\"", "category": "completion", "max_tokens": 512}
46+
{"text": "Write a Python function that serializes a datetime object to ISO 8601 string and deserializes it back, handling timezone-aware and naive datetimes.", "category": "python", "max_tokens": 512}
47+
{"text": "What does this code print and why?\nx = [1, 2, 3]\ny = x\ny.append(4)\nprint(x)", "category": "debugging", "max_tokens": 256}
48+
{"text": "Write a Python function that implements consistent hashing for distributing keys across N nodes.", "category": "python", "max_tokens": 512}
49+
{"text": "Generate a JSON-LD object representing a Person with name, job title, and employer according to schema.org.", "category": "json", "max_tokens": 256}

0 commit comments

Comments
 (0)