Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 27 additions & 24 deletions Releases/v5.0.0/.claude/hooks/SatisfactionCapture.hook.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,19 @@ const SYSTEM_TEXT_PATTERNS = [
/^Note:.*was read before/i,
];

// ── Rating Validity ──

/**
* A satisfaction rating is only meaningful when the model returns an actual
* value in the 1-10 band. Anything else (missing, null, out of range, NaN,
* non-number) is the ABSENCE of a signal — it must not be coerced to a neutral
* 5, which would pollute every running average (the aggregator flat-means
* `.rating` and ignores confidence).
*/
export function isValidRating(x: unknown): boolean {
return typeof x === 'number' && Number.isFinite(x) && x >= 1 && x <= 10;
}

// ── Rating Writer ──

function writeRating(entry: RatingEntry): void {
Expand Down Expand Up @@ -422,10 +435,9 @@ async function main() {
level: 'fast',
});

if (result.success && result.parsed) {
if (result.success && result.parsed && isValidRating((result.parsed as SentimentResult).rating)) {
const r = result.parsed as SentimentResult;
// Clamp rating to 1-10, default 5 if missing
const rating = (r.rating != null && r.rating >= 1 && r.rating <= 10) ? r.rating : 5;
const rating = r.rating;
const confidence = r.confidence || 0.5;

console.error(`[SatisfactionCapture] Implicit: ${rating}/10 (${confidence}) - ${r.summary || 'no summary'}`);
Expand Down Expand Up @@ -459,31 +471,22 @@ async function main() {
}).catch((err) => console.error(`[SatisfactionCapture] Failure capture error: ${err}`));
}
}
} else if (result.success) {
// Inference succeeded but returned no usable 1-10 rating (missing/garbled
// JSON). That's the absence of a signal, not a neutral 5 — skip the write.
console.error(`[SatisfactionCapture] Inference returned no valid rating — skipping signal (no rating written)`);
} else {
// Inference failed — default to 5 (neutral)
// Inference failed — skip the signal instead of fabricating a neutral 5.
// A failed classification is not a real satisfaction signal; writing a
// default-5 row pollutes ratings.jsonl and drags every running average
// toward the mushy middle. Log for observability, write nothing.
const errorReason = result.error || 'unknown';
console.error(`[SatisfactionCapture] Inference failed: ${errorReason} — defaulting to 5`);
writeRating({
timestamp: getISOTimestamp(),
rating: 5,
session_id: sessionId,
source: 'implicit',
sentiment_summary: `Inference failed: ${errorReason.slice(0, 80)}`,
confidence: 0.3,
});
console.error(`[SatisfactionCapture] Inference failed: ${errorReason} — skipping signal (no rating written)`);
}
} catch (err) {
// Inference errored — default to 5 (neutral)
// Inference errored — skip the signal (see note above). Do not fabricate a 5.
const errMsg = err instanceof Error ? err.message : String(err);
console.error(`[SatisfactionCapture] Inference error: ${errMsg} — defaulting to 5`);
writeRating({
timestamp: getISOTimestamp(),
rating: 5,
session_id: sessionId,
source: 'implicit',
sentiment_summary: `Inference error: ${errMsg.slice(0, 80)}`,
confidence: 0.3,
});
console.error(`[SatisfactionCapture] Inference error: ${errMsg} — skipping signal (no rating written)`);
}

process.exit(0);
Expand All @@ -493,4 +496,4 @@ async function main() {
}
}

main();
if (import.meta.main) main();
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Regression test for SatisfactionCapture.hook.ts inference-failure handling.
#
# Asserts: when the satisfaction classifier fails (here forced by making the
# `claude` executable unavailable so Inference's spawn returns failure), the
# hook writes NO row to ratings.jsonl and logs that it skipped the signal.
#
# Before the fix, the failure branch wrote a placeholder {"rating":5,...} row;
# those accumulate and flatten every running average. This test guards against
# that regression.
#
# Safe by construction: all writes go to a throwaway PAI_DIR under a temp dir;
# the real ~/.claude tree is never touched.
#
# Usage: bash SatisfactionCapture.failure-path.test.sh (exit 0 = PASS)
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOOK="$SCRIPT_DIR/../SatisfactionCapture.hook.ts"
BUN="$(command -v bun || true)"
[ -z "$BUN" ] && { echo "SKIP: bun not found on PATH"; exit 0; }
[ -f "$HOOK" ] && : || { echo "FAIL: hook not found at $HOOK"; exit 1; }

TMP="$(mktemp -d)"
PAIDIR="$TMP/PAI"
RATINGS="$PAIDIR/MEMORY/LEARNING/SIGNALS/ratings.jsonl"
mkdir -p "$PAIDIR"
trap 'rm -rf "$TMP"' EXIT

# Long, non-praise, non-explicit-rating prompt → reaches the inference path.
STDIN='{"prompt":"this is still broken and not what I asked for at all","session_id":"test-failure-path","transcript_path":"/nonexistent-transcript"}'

# env -i with a PATH that excludes the `claude` binary → spawn() fails ENOENT
# → Inference returns success:false → exercises the failure branch deterministically.
printf '%s' "$STDIN" | env -i HOME="$HOME" PATH="/usr/bin:/bin" PAI_DIR="$PAIDIR" "$BUN" "$HOOK" >"$TMP/stdout.log" 2>"$TMP/stderr.log"

ROWS=0
[ -f "$RATINGS" ] && ROWS="$(wc -l < "$RATINGS" | tr -d ' ')"

echo "rows written on inference failure: $ROWS"
echo "stderr:"; grep -i "inference" "$TMP/stderr.log" | sed 's/^/ /' || true

if [ "$ROWS" = "0" ] && grep -q "skipping signal" "$TMP/stderr.log"; then
echo "PASS — failure path wrote no rating and logged skip."
exit 0
else
echo "FAIL — expected 0 rows + a 'skipping signal' log; got rows=$ROWS"
[ -f "$RATINGS" ] && { echo "unexpected row:"; sed 's/^/ /' "$RATINGS"; }
exit 1
fi
21 changes: 21 additions & 0 deletions Releases/v5.0.0/.claude/hooks/tests/isValidRating.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Unit test for SatisfactionCapture's rating-validity guard.
//
// The success path must only persist a rating the model actually returned in
// the 1-10 band. Anything else is the absence of a signal and must NOT be
// coerced into a neutral 5 (which pollutes every running average).
//
// Run: bun test isValidRating.test.ts
import { test, expect } from "bun:test";
import { isValidRating } from "../SatisfactionCapture.hook.ts";

test("accepts valid 1-10 ratings", () => {
for (const v of [1, 5, 10, 7.5]) {
expect(isValidRating(v)).toBe(true);
}
});

test("rejects values that must not become a fabricated 5", () => {
for (const v of [null, undefined, 0, 0.9, 11, -1, NaN, Infinity, -Infinity, "5", {}, [], true]) {
expect(isValidRating(v as unknown)).toBe(false);
}
});