GrayCodeAI
diff --git a/‎.claude-plugin/plugin.json‎
Lines changed: 34 additions & 0 deletions b/‎.claude-plugin/plugin.json‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎.claude/scheduled_tasks.lock‎
Lines changed: 1 addition & 0 deletions b/‎.claude/scheduled_tasks.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 4 additions & 3 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎.github/workflows/quality.yml‎
Lines changed: 5 additions & 1 deletion b/‎.github/workflows/quality.yml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎evals/llm_run.py‎
Lines changed: 105 additions & 0 deletions b/‎evals/llm_run.py‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎evals/plot.py‎
Lines changed: 150 additions & 0 deletions b/‎evals/plot.py‎
Lines changed: 150 additions & 0 deletions
diff --git a/‎evals/prompts/en.txt‎
Lines changed: 10 additions & 0 deletions b/‎evals/prompts/en.txt‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎filters/ansible.toml‎
Lines changed: 34 additions & 0 deletions b/‎filters/ansible.toml‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎filters/basedpyright.toml‎
Lines changed: 15 additions & 0 deletions b/‎filters/basedpyright.toml‎
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,34 @@
+{
+  "name": "tok",
+  "description": "Unified token-reduction CLI: transparent command output filtering, six compression modes (lite/full/ultra/wenyan-*), SQLite-backed savings analytics, and 69+ built-in filters.",
+  "author": {
+    "name": "Lakshman Patel",
+    "url": "https://github.com/lakshmanpatel/tok"
+  },
+  "hooks": {
+    "SessionStart": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "node \"${CLAUDE_PLUGIN_ROOT}/hooks/tok-mode-activate.js\"",
+            "timeout": 5,
+            "statusMessage": "Loading tok mode..."
+          }
+        ]
+      }
+    ],
+    "UserPromptSubmit": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "node \"${CLAUDE_PLUGIN_ROOT}/hooks/tok-mode-tracker.js\"",
+            "timeout": 5,
+            "statusMessage": "Tracking tok mode..."
+          }
+        ]
+      }
+    ]
+  }
+}
@@ -0,0 +1 @@
+{"sessionId":"c62a742d-ec0a-47a3-887a-bbdbe70b0024","pid":1688229,"acquiredAt":1776659743194}
@@ -63,9 +63,10 @@ jobs:
 
     - name: Check formatting
       run: |
-        if [ "$(gofmt -s -l . | wc -l)" -gt 0 ]; then
-          echo "Please run 'gofmt -w .' to format the following files:"
-          gofmt -s -l .
+        bad=$(gofmt -s -l . 2>/dev/null | grep -v -E '^(\.gomodcache|\.gocache|\.gosrccache|vendor)/' || true)
+        if [ -n "$bad" ]; then
+          echo "Please run 'gofmt -s -w .' to format the following files:"
+          echo "$bad"
           exit 1
         fi
     
 
@@ -95,7 +95,11 @@ jobs:
     - name: Run golangci-lint
       uses: golangci/golangci-lint-action@v6
       with:
-        version: latest
+        # Pin to a version built with Go ≥ our module's go-directive (1.25).
+        # "latest" currently resolves to a v1.x built with Go 1.24 which
+        # errors with "Go language version used to build golangci-lint is
+        # lower than the targeted Go version".
+        version: v1.64.8
         args: --timeout=5m
 
     - name: Check go mod tidy
 
@@ -0,0 +1,105 @@
+"""
+Run each prompt through Claude Code in three conditions and snapshot the
+real LLM outputs:
+
+  1. baseline      — no extra system prompt at all
+  2. terse         — system prompt: "Answer concisely."
+  3. terse+skill   — system prompt: "Answer concisely.\n\n{SKILL.md}"
+
+The honest delta is (3) vs (2): how much does the SKILL itself add on top
+of a plain "be terse" instruction? Comparing (3) vs (1) conflates the
+skill with the generic terseness ask, which is what the previous version
+of this harness did.
+
+This is the source-of-truth generator. It calls a real LLM and produces
+evals/snapshots/results.json. Run it locally when SKILL.md files change.
+The CI-side `measure.py` only reads the snapshot and counts tokens.
+
+Requires:
+  - `claude` CLI on PATH (Claude Code), authenticated
+
+Run: uv run python evals/llm_run.py
+
+Environment:
+  TOK_EVAL_MODEL  optional --model flag value passed through to claude
+"""
+
+from __future__ import annotations
+
+import datetime as dt
+import json
+import os
+import subprocess
+from pathlib import Path
+
+EVALS = Path(__file__).parent
+SKILLS = EVALS.parent / "rules"
+PROMPTS = EVALS / "prompts" / "en.txt"
+SNAPSHOT = EVALS / "snapshots" / "results.json"
+
+TERSE_PREFIX = "Answer concisely."
+
+
+def run_claude(prompt: str, system: str | None = None) -> str:
+    """Run one prompt through ``claude -p`` and return its stripped stdout.
+
+    Args:
+        prompt: User prompt, appended as the final positional argument.
+        system: Optional system prompt. When truthy it is forwarded via
+            ``--system-prompt``; ``None`` or ``""`` sends no flag at all.
+
+    Returns:
+        The CLI's stdout with surrounding whitespace removed.
+
+    Raises:
+        subprocess.CalledProcessError: If ``claude`` exits non-zero. The
+            captured stderr is echoed first so the failure is diagnosable.
+        OSError: If the ``claude`` executable cannot be started.
+    """
+    import sys  # local import: only used on the failure path below
+
+    cmd = ["claude", "-p"]
+    if system:
+        cmd += ["--system-prompt", system]
+    # TOK_EVAL_MODEL is optional; without it the CLI picks its own default.
+    if model := os.environ.get("TOK_EVAL_MODEL"):
+        cmd += ["--model", model]
+    cmd.append(prompt)
+    try:
+        out = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            # Decode explicitly as UTF-8: text=True alone falls back to the
+            # locale encoding, which can garble or reject non-ASCII output.
+            encoding="utf-8",
+            check=True,
+        )
+    except subprocess.CalledProcessError as err:
+        # capture_output swallows stderr and the exception message does not
+        # include it, so surface it before propagating the original error.
+        if err.stderr:
+            print(err.stderr, file=sys.stderr, flush=True)
+        raise
+    return out.stdout.strip()
+
+
+def claude_version() -> str:
+    """Return the output of ``claude --version``, or ``"unknown"`` on any error.
+
+    This is a best-effort probe used only for snapshot metadata, so every
+    failure (missing binary, non-zero exit, ...) degrades to ``"unknown"``.
+    """
+    probe = ["claude", "--version"]
+    try:
+        result = subprocess.run(probe, capture_output=True, text=True, check=True)
+    except Exception:
+        return "unknown"
+    return result.stdout.strip()
+
+
+def main() -> None:
+    """Generate the results snapshot by running every prompt through each arm.
+
+    Arms, in order:
+      * ``__baseline__`` - no system prompt.
+      * ``__terse__``    - system prompt is TERSE_PREFIX only (the control).
+      * one arm per directory under SKILLS that contains a ``SKILL.md``,
+        with system prompt ``TERSE_PREFIX + "\\n\\n" + SKILL.md``.
+
+    The collected outputs and run metadata are written to SNAPSHOT as JSON.
+    All file I/O is pinned to UTF-8: the JSON is dumped with
+    ``ensure_ascii=False``, so relying on the locale encoding could raise
+    ``UnicodeEncodeError`` (or mis-read the inputs) on non-UTF-8 systems.
+    """
+    prompts = [
+        p.strip()
+        for p in PROMPTS.read_text(encoding="utf-8").splitlines()
+        if p.strip()
+    ]
+    skills = sorted(p.name for p in SKILLS.iterdir() if (p / "SKILL.md").exists())
+
+    print(
+        f"=== {len(prompts)} prompts × ({len(skills)} skills + 2 control arms) ===",
+        flush=True,
+    )
+
+    snapshot: dict = {
+        "metadata": {
+            "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+            "claude_cli_version": claude_version(),
+            "model": os.environ.get("TOK_EVAL_MODEL", "default"),
+            "n_prompts": len(prompts),
+            "terse_prefix": TERSE_PREFIX,
+        },
+        "prompts": prompts,
+        "arms": {},
+    }
+
+    print("baseline (no system prompt)", flush=True)
+    snapshot["arms"]["__baseline__"] = [run_claude(p) for p in prompts]
+
+    print("terse (control: terse instruction only, no skill)", flush=True)
+    snapshot["arms"]["__terse__"] = [
+        run_claude(p, system=TERSE_PREFIX) for p in prompts
+    ]
+
+    for skill in skills:
+        skill_md = (SKILLS / skill / "SKILL.md").read_text(encoding="utf-8")
+        system = f"{TERSE_PREFIX}\n\n{skill_md}"
+        print(f"  {skill}", flush=True)
+        snapshot["arms"][skill] = [run_claude(p, system=system) for p in prompts]
+
+    SNAPSHOT.parent.mkdir(parents=True, exist_ok=True)
+    SNAPSHOT.write_text(
+        json.dumps(snapshot, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+    print(f"\nWrote {SNAPSHOT}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,150 @@
+"""
+Generate a boxplot showing the distribution of token compression per
+skill, compared against a plain "Answer concisely." control.
+
+Reads evals/snapshots/results.json and writes:
+  - evals/snapshots/results.html  (interactive plotly)
+  - evals/snapshots/results.png   (static export for README/PR embed)
+
+Run: uv run --with tiktoken --with plotly --with kaleido python evals/plot.py
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from pathlib import Path
+
+import plotly.graph_objects as go
+import tiktoken
+
+ENCODING = tiktoken.get_encoding("o200k_base")
+SNAPSHOT = Path(__file__).parent / "snapshots" / "results.json"
+HTML_OUT = Path(__file__).parent / "snapshots" / "results.html"
+PNG_OUT = Path(__file__).parent / "snapshots" / "results.png"
+
+
+def count(text: str) -> int:
+    """Return the number of tokens in *text* under the module-level ENCODING.
+
+    Model output is arbitrary text, so special-token literals such as
+    ``<|endoftext|>`` are encoded as ordinary text. By default ``encode``
+    raises ``ValueError`` when it meets one.
+    """
+    return len(ENCODING.encode(text, disallowed_special=()))
+
+
+def main() -> None:
+    """Render per-skill token savings versus the terse control as a boxplot.
+
+    Reads SNAPSHOT, computes for every skill arm the per-prompt percentage
+    saving relative to the ``__terse__`` arm (positive = shorter than the
+    control), and writes the figure to HTML_OUT and PNG_OUT. The
+    ``__baseline__`` and ``__terse__`` arms are not plotted themselves.
+    """
+    data = json.loads(SNAPSHOT.read_text(encoding="utf-8"))
+    arms = data["arms"]
+    meta = data.get("metadata", {})
+
+    terse_tokens = [count(o) for o in arms["__terse__"]]
+
+    rows = []
+    for skill, outputs in arms.items():
+        if skill in ("__baseline__", "__terse__"):
+            continue
+        skill_tokens = [count(o) for o in outputs]
+        # A zero-token control answer would divide by zero; report 0% instead.
+        savings = [
+            (1 - (s / t)) * 100 if t else 0.0
+            for s, t in zip(skill_tokens, terse_tokens)
+        ]
+        if not savings:
+            # statistics.median() and max() both raise on empty data.
+            continue
+        rows.append(
+            {"skill": skill, "savings": savings, "median": statistics.median(savings)}
+        )
+
+    rows.sort(key=lambda r: -r["median"])  # best first
+
+    fig = go.Figure()
+
+    for row in rows:
+        fig.add_trace(
+            go.Box(
+                y=row["savings"],
+                name=row["skill"],
+                boxpoints="all",
+                jitter=0.4,
+                pointpos=0,
+                marker=dict(color="#2ca02c", size=7, opacity=0.7),
+                line=dict(color="#2c3e50", width=2),
+                fillcolor="rgba(76, 120, 168, 0.25)",
+                boxmean=True,
+                hovertemplate="<b>%{x}</b><br>%{y:.1f}%<extra></extra>",
+            )
+        )
+
+    # Apply the layout FIRST: passing annotations=[...] replaces the whole
+    # annotation list, so anything annotated before this call would be lost.
+    fig.update_layout(
+        title=dict(
+            text=f"<b>How much shorter does each skill make Claude's answers?</b><br>"
+            f"<sub>Distribution of per-prompt savings vs system prompt = "
+            f"<i>'Answer concisely.'</i><br>"
+            f"{meta.get('model', '?')} · n={meta.get('n_prompts', '?')} prompts · "
+            f"single run per arm</sub>",
+            x=0.5,
+            xanchor="center",
+        ),
+        xaxis=dict(title="", automargin=True),
+        yaxis=dict(
+            title="↑ shorter  ·  vs control  ·  longer ↓",
+            ticksuffix="%",
+            zeroline=False,
+            gridcolor="rgba(0,0,0,0.08)",
+            range=[-30, 115],
+        ),
+        plot_bgcolor="white",
+        height=560,
+        width=980,
+        margin=dict(l=140, r=80, t=120, b=120),
+        showlegend=False,
+        annotations=[
+            dict(
+                x=0.5,
+                y=-0.22,
+                xref="paper",
+                yref="paper",
+                showarrow=False,
+                font=dict(size=11, color="#555"),
+                text=(
+                    "<b>box</b> = IQR (middle 50%) · "
+                    "<b>line in box</b> = median · "
+                    "<b>dashed line</b> = mean · "
+                    "<b>green dots</b> = individual prompts"
+                ),
+            )
+        ],
+    )
+
+    # zero line — "no effect". Added after update_layout so that its label
+    # (which is itself a layout annotation) is kept.
+    fig.add_hline(
+        y=0,
+        line=dict(color="black", width=1.5, dash="dash"),
+        annotation_text="no effect (= same length as control)",
+        annotation_position="top right",
+        annotation_font=dict(size=11, color="black"),
+    )
+
+    # median labels above each box (added once, after the layout is final)
+    for row in rows:
+        fig.add_annotation(
+            x=row["skill"],
+            y=max(row["savings"]),
+            text=f"<b>{row['median']:+.0f}%</b>",
+            showarrow=False,
+            yshift=22,
+            font=dict(size=16, color="#2c3e50"),
+        )
+
+    fig.write_html(HTML_OUT)
+    print(f"Wrote {HTML_OUT}")
+    fig.write_image(PNG_OUT, scale=2)
+    print(f"Wrote {PNG_OUT}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,10 @@
+Why does my React component re-render every time the parent updates?
+Explain database connection pooling.
+What's the difference between TCP and UDP?
+How do I fix a memory leak in a long-running Node.js process?
+What does the SQL EXPLAIN command tell me?
+How does a hash table handle collisions?
+Why am I getting CORS errors in my browser console?
+What's the point of using a debouncer on a search input?
+How does git rebase differ from git merge?
+When should I use a queue vs a topic in messaging systems?
@@ -0,0 +1,34 @@
+# Ansible playbook filter
+schema_version = 1
+
+# Filter applied to commands that start with `ansible-playbook`.
+# NOTE(review): how the engine resolves a line matched by both
+# strip_lines_matching and keep_lines_matching is not visible here — confirm.
+[ansible_playbook]
+match_command = "^ansible-playbook\\b"
+strip_ansi = true
+# Patterns for blank lines, per-host "ok: [" / "skipping: [" results, and
+# lines beginning with "Gathering Facts".
+# NOTE(review): Ansible prints that banner as "TASK [Gathering Facts] ***",
+# which the "^\\s*Gathering Facts" anchor does not match — confirm intent.
+strip_lines_matching = [
+    "^\\s*$",
+    "^ok: \\[",
+    "^skipping: \\[",
+    "^\\s*Gathering Facts",
+]
+# Patterns for play/task banners, changed/failed/fatal/unreachable host
+# results, the PLAY RECAP header with its per-host "host : ok=N" rows, and
+# any line containing "Error:".
+keep_lines_matching = [
+    "^PLAY \\[",
+    "^TASK \\[",
+    "^changed: \\[",
+    "^failed: \\[",
+    "^fatal: \\[",
+    "^unreachable: \\[",
+    "^PLAY RECAP",
+    "^\\S+\\s+:\\s+ok=\\d+",
+    "Error:.*",
+]
+max_lines = 80
+# Presumably emitted when filtering leaves no output — verify against the engine.
+on_empty = "Playbook completed, no changes"
+
+[ansible_inventory]
+match_command = "^ansible-inventory\\b"
+strip_ansi = true
+keep_lines_matching = [
+    "^\\[.*\\]",
+    "^\\S+$",
+]
+max_lines = 40
@@ -0,0 +1,15 @@
+# basedpyright filter
+schema_version = 1
+
+[basedpyright]
+match_command = "^basedpyright\\b"
+strip_ansi = true
+strip_lines_matching = [
+  "^\\s*$",
+  "^Searching for source files",
+  "^Found \\d+ source file",
+  "^Pyright \\d+\\.\\d+",
+  "^basedpyright \\d+\\.\\d+",
+]
+max_lines = 50
+on_empty = "basedpyright: ok"
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{"sessionId":"c62a742d-ec0a-47a3-887a-bbdbe70b0024","pid":1688229,"acquiredAt":1776659743194}`