NullPointerDepressiveDisorder
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 7 additions & 5 deletions b/‎README.md‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎src/infer_check/cli.py‎
Lines changed: 22 additions & 16 deletions b/‎src/infer_check/cli.py‎
Lines changed: 22 additions & 16 deletions
diff --git a/‎src/infer_check/prompt_suites/__init__.py‎
Lines changed: 50 additions & 0 deletions b/‎src/infer_check/prompt_suites/__init__.py‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎src/infer_check/prompt_suites/adversarial-numerics.jsonl‎
Lines changed: 30 additions & 0 deletions b/‎src/infer_check/prompt_suites/adversarial-numerics.jsonl‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/infer_check/prompt_suites/code.jsonl‎
Lines changed: 49 additions & 0 deletions b/‎src/infer_check/prompt_suites/code.jsonl‎
Lines changed: 49 additions & 0 deletions
@@ -25,7 +25,7 @@ uv.lock
 htmlcov/
 .test_cache/
 
-# infer-check results (large, generated)
+# infer-check results and caches (generated)
 results/
 *.report.html
 *.report.json
 
@@ -50,10 +50,12 @@ infer-check sweep \
             8bit=mlx-community/Meta-Llama-3.1-8B-Instruct-8bit,\
             4bit=mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" \
   --backend mlx-lm \
-  --prompts ./prompt-suites/reasoning.jsonl \
+  --prompts reasoning \
   --output ./results/sweep/
 ```
 
+`--prompts` accepts either a bundled suite name (`reasoning`, `code`, `adversarial-numerics`, `determinism`, `long-context`) or a path to any `.jsonl` file.
+
 The baseline is automatically run twice as a self-check — if it's not 50/50 identical, your comparison data is unreliable.
 
 ```
@@ -79,7 +81,7 @@ infer-check diff \
   --model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
   --backends "mlx-lm,openai-compat" \
   --base-urls ",http://localhost:8000" \
-  --prompts ./prompt-suites/reasoning.jsonl \
+  --prompts reasoning \
   --output ./results/diff/
 ```
 
@@ -93,7 +95,7 @@ Same prompt N times at temperature=0. Output should be bit-identical every run.
 infer-check determinism \
   --model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
   --backend mlx-lm \
-  --prompts ./prompt-suites/determinism.jsonl \
+  --prompts determinism \
   --runs 20 \
   --output ./results/determinism/
 ```
@@ -107,7 +109,7 @@ infer-check stress \
   --model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit \
   --backend openai-compat \
   --base-url http://localhost:8000 \
-  --prompts ./prompt-suites/reasoning.jsonl \
+  --prompts reasoning \
   --concurrency 1,2,4,8 \
   --output ./results/stress/
 ```
@@ -132,7 +134,7 @@ Curated prompts targeting known quantization failure modes:
 | `long-context.jsonl` | 10 | Tables and transcripts with recall questions |
 | `determinism.jsonl` | 50 | High-entropy continuations for determinism testing |
 
-Custom suites are JSONL files: `{"id": "...", "text": "...", "category": "...", "max_tokens": N}` per line.
+All suites ship with the package — no need to clone the repo. Custom suites are JSONL files: `{"id": "...", "text": "...", "category": "...", "max_tokens": N}` per line.
 
 ## Supported backends
 
 
@@ -14,6 +14,16 @@
 console = Console()
 
 
+def _resolve_prompts(prompts: str) -> Path:
+    """Resolve the ``--prompts`` value to a prompt-suite file path.
+
+    Args:
+        prompts: A bundled suite name (e.g. ``"reasoning"``) or a path to a
+            ``.jsonl`` file, exactly as given on the command line.
+
+    Returns:
+        The path returned by ``get_suite_path`` for ``prompts``.
+
+    Raises:
+        click.BadParameter: If ``get_suite_path`` raises
+            ``FileNotFoundError``. The original message is preserved and
+            the error is attributed to ``--prompts``.
+    """
+    from infer_check.prompt_suites import get_suite_path
+
+    try:
+        return get_suite_path(prompts)
+    except FileNotFoundError as exc:
+        # Validation happens here rather than in a click type, so click has
+        # no parameter context; name the option explicitly in the error.
+        raise click.BadParameter(str(exc), param_hint="--prompts") from exc
+
+
 @click.group()
 @click.version_option(package_name="infer-check")
 def main() -> None:
@@ -39,8 +49,7 @@ def main() -> None:
 @click.option(
     "--prompts",
     required=True,
-    type=click.Path(exists=True, dir_okay=False, path_type=Path),
-    help="Path to JSONL prompt suite.",
+    help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
 )
 @click.option(
     "--output",
@@ -58,7 +67,7 @@ def main() -> None:
 def sweep(
     models: str,
     backend: str,
-    prompts: Path,
+    prompts: str,
     output: Path,
     baseline: str | None,
     base_url: str | None,
@@ -112,7 +121,7 @@ def sweep(
         tag = " (baseline)" if label == baseline_label else ""
         console.print(f"  {label}: {path}{tag}")
 
-    prompt_list = load_suite(prompts)
+    prompt_list = load_suite(_resolve_prompts(prompts))
 
     # Build a separate backend for each model
     backend_map: dict[str, Any] = {}
@@ -214,8 +223,7 @@ def sweep(
 @click.option(
     "--prompts",
     required=True,
-    type=click.Path(exists=True, dir_okay=False, path_type=Path),
-    help="Path to JSONL prompt suite.",
+    help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
 )
 @click.option(
     "--output",
@@ -239,7 +247,7 @@ def sweep(
 def diff(
     model: str,
     backends: str,
-    prompts: Path,
+    prompts: str,
     output: Path,
     quant: str | None,
     base_urls: str | None,
@@ -262,7 +270,7 @@ def diff(
         f"[bold cyan]diff[/bold cyan] model={model} backends={backend_names} quant={quant}"
     )
 
-    prompt_list = load_suite(prompts)
+    prompt_list = load_suite(_resolve_prompts(prompts))
 
     backend_instances = []
     for name, url in zip(backend_names, url_list, strict=True):
@@ -332,8 +340,7 @@ def diff(
 @click.option(
     "--prompts",
     required=True,
-    type=click.Path(exists=True, dir_okay=False, path_type=Path),
-    help="Path to JSONL prompt suite.",
+    help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
 )
 @click.option(
     "--output",
@@ -352,7 +359,7 @@ def diff(
 def stress(
     model: str,
     backend: str,
-    prompts: Path,
+    prompts: str,
     output: Path,
     concurrency: str,
     base_url: str | None,
@@ -369,7 +376,7 @@ def stress(
         f"concurrency={concurrency_levels}"
     )
 
-    prompt_list = load_suite(prompts)
+    prompt_list = load_suite(_resolve_prompts(prompts))
 
     config = BackendConfig(
         backend_type=backend,  # type: ignore[arg-type]
@@ -424,8 +431,7 @@ def stress(
 @click.option(
     "--prompts",
     required=True,
-    type=click.Path(exists=True, dir_okay=False, path_type=Path),
-    help="Path to JSONL prompt suite.",
+    help="Bundled suite name (e.g. 'reasoning') or path to a .jsonl file.",
 )
 @click.option(
     "--output",
@@ -439,7 +445,7 @@ def stress(
 def determinism(
     model: str,
     backend: str,
-    prompts: Path,
+    prompts: str,
     output: Path,
     runs: int,
     base_url: str | None,
@@ -451,7 +457,7 @@ def determinism(
 
     console.print(f"[bold cyan]determinism[/bold cyan] model={model} backend={backend} runs={runs}")
 
-    prompt_list = load_suite(prompts)
+    prompt_list = load_suite(_resolve_prompts(prompts))
 
     config = BackendConfig(
         backend_type=backend,  # type: ignore[arg-type]
 
@@ -0,0 +1,50 @@
+"""Bundled prompt suites for infer-check.
+
+Use ``get_suite_path("reasoning")`` to get the path to a bundled suite,
+or ``list_suites()`` to see all available suites.
+"""
+
+from __future__ import annotations
+
+from importlib import resources
+from pathlib import Path
+
+__all__ = ["get_suite_path", "list_suites"]
+
+_PACKAGE = "infer_check.prompt_suites"
+
+
+def list_suites() -> list[str]:
+    """Return the sorted names of all bundled prompt suites.
+
+    A suite is any ``.jsonl`` file directly inside this package; its name is
+    the file name with the ``.jsonl`` extension removed.
+    """
+    # Only regular files count, matching the ``is_file()`` check that
+    # ``get_suite_path`` applies before it resolves a bundled name.
+    return sorted(
+        entry.name.removesuffix(".jsonl")
+        for entry in resources.files(_PACKAGE).iterdir()
+        if entry.name.endswith(".jsonl") and entry.is_file()
+    )
+
+
+def get_suite_path(name: str) -> Path:
+    """Resolve a suite name or file path to a prompt-suite path.
+
+    Accepts either:
+      - An existing file path (returned as-is)
+      - A bare name like ``"reasoning"``, with or without the ``.jsonl``
+        extension (resolves to the bundled suite)
+
+    An existing file takes precedence over a bundled suite of the same name.
+
+    Raises:
+        FileNotFoundError: If ``name`` is neither an existing file nor a
+            bundled suite; the message lists the available bundled suites.
+    """
+    # Require a regular file: a directory that merely shares the name (or an
+    # empty string, which ``Path`` treats as ".") is not a usable suite and
+    # must not shadow a bundled one.
+    candidate = Path(name)
+    if candidate.is_file():
+        return candidate
+
+    # Try as a bundled suite name
+    clean = name.removesuffix(".jsonl")
+    ref = resources.files(_PACKAGE) / f"{clean}.jsonl"
+    if ref.is_file():
+        # NOTE(review): str(ref) is only a usable filesystem path when the
+        # package is installed as regular files; resources.as_file() would
+        # be needed for other loaders (e.g. a zipped install).
+        return Path(str(ref))
+
+    available = list_suites()
+    raise FileNotFoundError(
+        f"Prompt suite '{name}' not found.\n"
+        f"Available bundled suites: {', '.join(available)}\n"
+        f"Or pass a path to a .jsonl file."
+    )
@@ -0,0 +1,30 @@
+{"text": "What is 0.1 + 0.2? Give a precise decimal answer.", "category": "floating_point", "max_tokens": 256}
+{"text": "Compute 9999999999999999 + 1.", "category": "large_numbers", "max_tokens": 256}
+{"text": "What is 10^308?", "category": "large_numbers", "max_tokens": 256}
+{"text": "What is 1/3 expressed as a decimal to 20 places?", "category": "precision", "max_tokens": 256}
+{"text": "Is 2^31 - 1 equal to 2147483647?", "category": "large_numbers", "max_tokens": 256}
+{"text": "What is the decimal representation of 1/7?", "category": "precision", "max_tokens": 256}
+{"text": "Compute: (-1)^(1/3). Is it -1 or a complex number?", "category": "edge_case", "max_tokens": 256}
+{"text": "What is 0^0?", "category": "edge_case", "max_tokens": 256}
+{"text": "What is the value of 1e-300 * 1e-300?", "category": "underflow", "max_tokens": 256}
+{"text": "Compute 170! (170 factorial). How many digits does it have?", "category": "large_numbers", "max_tokens": 256}
+{"text": "Write the number 1000000000000 in words.", "category": "formatting", "max_tokens": 256}
+{"text": "What is -0.0 equal to? Is -0.0 == 0.0 in IEEE 754?", "category": "edge_case", "max_tokens": 256}
+{"text": "Convert $1,234,567.89 to Japanese yen at a rate of 149.32 yen per dollar.", "category": "precision", "max_tokens": 256}
+{"text": "What is infinity minus infinity?", "category": "edge_case", "max_tokens": 256}
+{"text": "How many seconds are in a leap year? Show your calculation.", "category": "precision", "max_tokens": 256}
+{"text": "What is floor(-2.5)? What is round(-2.5)?", "category": "edge_case", "max_tokens": 256}
+{"text": "Express the speed of light (299792458 m/s) in miles per hour.", "category": "precision", "max_tokens": 256}
+{"text": "What is 2^53 + 1? Can this number be exactly represented as a 64-bit float?", "category": "floating_point", "max_tokens": 256}
+{"text": "Compute the sum: 1 + 1/2 + 1/4 + 1/8 + ... (infinite geometric series)", "category": "precision", "max_tokens": 256}
+{"text": "What is NaN == NaN in IEEE 754 floating point?", "category": "edge_case", "max_tokens": 256}
+{"text": "How many grains of sand are on Earth? Express in scientific notation.", "category": "large_numbers", "max_tokens": 256}
+{"text": "What is 7/13 * 13/7? Is the result exactly 1?", "category": "floating_point", "max_tokens": 256}
+{"text": "Convert the binary number 10110011.101 to decimal.", "category": "formatting", "max_tokens": 256}
+{"text": "What is 999...9 (fifty 9s) + 1?", "category": "large_numbers", "max_tokens": 256}
+{"text": "Compute 1/0. What happens? Now compute 1/0.0 in IEEE 754.", "category": "edge_case", "max_tokens": 256}
+{"text": "Express π to 30 decimal places.", "category": "precision", "max_tokens": 256}
+{"text": "What is MAX_INT in a 64-bit signed integer? What happens if you add 1 to it?", "category": "edge_case", "max_tokens": 256}
+{"text": "How many microseconds are in 3 years, 7 months, and 12 days?", "category": "precision", "max_tokens": 512}
+{"text": "What is 1/998001? This produces an interesting decimal pattern.", "category": "precision", "max_tokens": 512}
+{"text": "Is 0.9999999999 (ten 9s) equal to 1? What about 0.999... (infinitely repeating)?", "category": "floating_point", "max_tokens": 256}
@@ -0,0 +1,49 @@
+{"text": "Write a Python function that checks if a string is a valid IPv4 address.", "category": "python", "max_tokens": 512}
+{"text": "Generate a valid JSON object representing a user profile with name, email, age, and a list of 3 hobbies.", "category": "json", "max_tokens": 256}
+{"text": "Write a Python function to find the longest common subsequence of two strings.", "category": "python", "max_tokens": 512}
+{"text": "Here is a buggy function:\ndef fib(n):\n    if n <= 1:\n        return 1\n    return fib(n-1) + fib(n-2)\nWhat is wrong with it? Fix it so fib(0)=0, fib(1)=1.", "category": "debugging", "max_tokens": 512}
+{"text": "Write a Python class called BankAccount with deposit, withdraw, and balance methods. Withdrawals should fail if insufficient funds.", "category": "python", "max_tokens": 512}
+{"text": "Generate a JSON array of 5 objects, each with fields: id (integer), name (string), score (float between 0 and 1), tags (array of strings).", "category": "json", "max_tokens": 512}
+{"text": "Write a Python function that merges two sorted lists into one sorted list without using the built-in sort.", "category": "python", "max_tokens": 512}
+{"text": "Complete this Python code:\nimport re\ndef extract_emails(text):\n    \"\"\"Return all email addresses found in text.\"\"\"", "category": "completion", "max_tokens": 256}
+{"text": "Write a SQL query that finds the second highest salary from an employees table.", "category": "sql", "max_tokens": 256}
+{"text": "Here is buggy code:\ndef binary_search(arr, target):\n    low, high = 0, len(arr)\n    while low < high:\n        mid = (low + high) // 2\n        if arr[mid] == target:\n            return mid\n        elif arr[mid] < target:\n            low = mid\n        else:\n            high = mid\n    return -1\nThis has an infinite loop bug. Find and fix it.", "category": "debugging", "max_tokens": 512}
+{"text": "Write a Python generator function that yields prime numbers indefinitely.", "category": "python", "max_tokens": 512}
+{"text": "Generate a JSON Schema that validates an object with required fields: name (string, min 1 char), age (integer, min 0, max 150), email (string, format email).", "category": "json", "max_tokens": 512}
+{"text": "Write a Python decorator that caches function results (memoization) using a dictionary.", "category": "python", "max_tokens": 512}
+{"text": "Complete this code:\nclass Stack:\n    def __init__(self):\n        self._items = []\n    \n    def push(self, item):", "category": "completion", "max_tokens": 256}
+{"text": "Write a Python function that converts a Roman numeral string to an integer.", "category": "python", "max_tokens": 512}
+{"text": "Here is buggy code:\ndef flatten(lst):\n    result = []\n    for item in lst:\n        if isinstance(item, list):\n            result.extend(item)\n        else:\n            result.append(item)\n    return result\nThis doesn't handle deeply nested lists. Fix it to work recursively.", "category": "debugging", "max_tokens": 512}
+{"text": "Write a Python async function that fetches 3 URLs concurrently using asyncio and aiohttp.", "category": "python", "max_tokens": 512}
+{"text": "Generate valid YAML for a Kubernetes deployment with 3 replicas of an nginx container on port 80.", "category": "yaml", "max_tokens": 512}
+{"text": "Write a Python function that implements the Levenshtein edit distance between two strings.", "category": "python", "max_tokens": 512}
+{"text": "Complete this code:\ndef parse_csv_line(line: str) -> list[str]:\n    \"\"\"Parse a CSV line handling quoted fields with commas inside.\"\"\"", "category": "completion", "max_tokens": 512}
+{"text": "Write a Python context manager class that measures and prints execution time of a code block.", "category": "python", "max_tokens": 256}
+{"text": "Generate a JSON Web Token (JWT) payload with fields: sub, iat (unix timestamp for now), exp (1 hour from now), role, permissions array.", "category": "json", "max_tokens": 256}
+{"text": "Write a Python function that takes a nested dictionary and flattens it with dot-notation keys. Example: {'a': {'b': 1}} -> {'a.b': 1}", "category": "python", "max_tokens": 512}
+{"text": "Here is code with a subtle bug:\ndef unique_chars(s):\n    return len(s) == len(set(s))\nDoes this work for all Unicode strings? What about combining characters?", "category": "debugging", "max_tokens": 512}
+{"text": "Write a Python function that validates whether a string of parentheses, brackets, and braces is balanced.", "category": "python", "max_tokens": 256}
+{"text": "Generate a JSON object representing an API error response following RFC 7807 (Problem Details).", "category": "json", "max_tokens": 256}
+{"text": "Write a Python function that implements run-length encoding. 'AAABBBCCCC' -> '3A3B4C'", "category": "python", "max_tokens": 256}
+{"text": "Write a Python type-annotated function that takes a list of dicts and groups them by a specified key.", "category": "python", "max_tokens": 512}
+{"text": "Complete this code to implement a simple LRU cache:\nclass LRUCache:\n    def __init__(self, capacity: int):", "category": "completion", "max_tokens": 512}
+{"text": "Write a regular expression that matches valid email addresses. Explain each part.", "category": "python", "max_tokens": 512}
+{"text": "Here is buggy Python:\ndef quicksort(arr):\n    if len(arr) <= 1:\n        return arr\n    pivot = arr[0]\n    left = [x for x in arr if x < pivot]\n    right = [x for x in arr if x > pivot]\n    return quicksort(left) + [pivot] + quicksort(right)\nWhat happens with duplicate elements? Fix it.", "category": "debugging", "max_tokens": 512}
+{"text": "Write a Python function that converts a Python dictionary to a valid GraphQL query string.", "category": "python", "max_tokens": 512}
+{"text": "Generate a JSON array where each element is a date string in ISO 8601 format for every Monday in March 2026.", "category": "json", "max_tokens": 256}
+{"text": "Write a Python function that reads a file and returns the 10 most frequent words with their counts.", "category": "python", "max_tokens": 512}
+{"text": "Complete this async generator:\nasync def read_chunks(stream, chunk_size=1024):\n    \"\"\"Yield chunks from an async byte stream.\"\"\"", "category": "completion", "max_tokens": 256}
+{"text": "Write a Python dataclass for a 2D Point with distance_to, midpoint, and __add__ methods.", "category": "python", "max_tokens": 512}
+{"text": "Write a one-liner Python list comprehension that generates all Pythagorean triples where a,b,c < 100.", "category": "python", "max_tokens": 256}
+{"text": "Generate a minimal valid HTML5 document with a title, a heading, a paragraph, and a link.", "category": "html", "max_tokens": 256}
+{"text": "Write a Python function that converts a flat list of parent-child pairs into a tree structure.", "category": "python", "max_tokens": 512}
+{"text": "Here is buggy code:\ndef safe_divide(a, b):\n    try:\n        return a / b\n    except:\n        return 0\nWhat are the problems with this error handling? Rewrite it properly.", "category": "debugging", "max_tokens": 512}
+{"text": "Write a Python function that implements the Sieve of Eratosthenes up to n.", "category": "python", "max_tokens": 256}
+{"text": "Generate a JSON object with intentionally tricky values: empty string, null, false, 0, empty array, nested nulls.", "category": "json", "max_tokens": 256}
+{"text": "Write a Dockerfile for a Python 3.11 FastAPI app that listens on port 8000.", "category": "devops", "max_tokens": 256}
+{"text": "Write a Python function that takes markdown text and extracts all URLs from it.", "category": "python", "max_tokens": 256}
+{"text": "Complete this:\ndef retry(max_attempts=3, delay=1.0):\n    \"\"\"Decorator that retries a function on exception.\"\"\"", "category": "completion", "max_tokens": 512}
+{"text": "Write a Python function that serializes a datetime object to ISO 8601 string and deserializes it back, handling timezone-aware and naive datetimes.", "category": "python", "max_tokens": 512}
+{"text": "What does this code print and why?\nx = [1, 2, 3]\ny = x\ny.append(4)\nprint(x)", "category": "debugging", "max_tokens": 256}
+{"text": "Write a Python function that implements consistent hashing for distributing keys across N nodes.", "category": "python", "max_tokens": 512}
+{"text": "Generate a JSON-LD object representing a Person with name, job title, and employer according to schema.org.", "category": "json", "max_tokens": 256}