
Commit 302dfd2

docs: fix API signatures and expand coverage across all 10 skill docs
- Fix evaluate(), loadFromFile(), compareAgainstGolden() signatures
- Fix monitorLatency(), calculateTrajectoryCost(), checkBudget() signatures
- Fix validateTrajectory(), JudgeCalibrator, JudgeConfig APIs
- Fix GateEngine.evaluate(), RunComparator.compare() signatures
- Add missing classes: CostTracker, LatencyTracker, GoldenCurator, SuiteRunner
- Add missing APIs: verifyResult, batchCompare, analyzeOptimization
- Add model price table, budget presets, latency presets, gate presets
- Add MCP tool references and full CI integration patterns
- Remove non-existent CLI flags and inaccurate API examples
1 parent 592a41e commit 302dfd2

10 files changed

Lines changed: 682 additions & 202 deletions

skills/cost-tracking/skill.md

Lines changed: 67 additions & 31 deletions
````diff
@@ -2,7 +2,7 @@
 
 ## What It Is
 
-Cost tracking calculates per-task and per-trajectory expenses, including LLM API costs, tool invocation costs, and judge evaluation costs. It enforces budgets and provides cost optimization insights.
+Cost tracking calculates per-task and per-trajectory expenses, including LLM API costs, tool invocation costs, and judge evaluation costs. It enforces budgets with 3-tier alert thresholds (50% log, 75% notify, 90% block) and provides cost optimization insights.
 
 ## Why It Matters
 
@@ -13,52 +13,67 @@ Cost tracking calculates per-task and per-trajectory expenses, including LLM API
 
 ## How to Use It
 
-### Track Costs
+### CLI: Eval with Budget
 
 ```bash
 npx agent-eval-harness eval trajectories/*.jsonl \
   --budget 10.00 \
   --output results/
 ```
 
-### Cost Breakdown
+### Calculate Trajectory Cost
 
 ```typescript
-import { calculateTrajectoryCost } from '@reaatech/agent-eval-harness';
+import { calculateTrajectoryCost, DEFAULT_PRICING } from '@reaatech/agent-eval-harness';
 
-const pricing = {
-  'claude-opus': { input: 15.00, output: 75.00 },
-  'gpt-4-turbo': { input: 10.00, output: 30.00 },
-};
+// Uses built-in pricing for 8 models (claude-opus, claude-sonnet, claude-haiku,
+// gpt-4-turbo, gpt-4, gpt-4-mini, gemini-pro, gemini-flash)
+const cost = calculateTrajectoryCost(trajectory, 'claude-opus');
 
-const breakdown = await calculateTrajectoryCost('trajectories/run.jsonl', pricing);
-
-console.log(`Total Cost: $${breakdown.total_cost}`);
-console.log(`LLM Calls: $${breakdown.llm_calls}`);
-console.log(`Tool Invocations: $${breakdown.tool_invocations}`);
-console.log(`Judge Evaluations: $${breakdown.judge_evaluations}`);
+console.log(`Total: $${formatCost(cost.total_cost)}`);
+console.log(`LLM Calls: $${formatCost(cost.llm_calls)}`);
+console.log(`Tool Invocations: $${formatCost(cost.tool_invocations)}`);
+console.log(`Per-turn breakdown:`, cost.per_turn);
 ```
 
-### Budget Alerts
+### Budget Enforcement
 
 ```typescript
-import { checkBudget, createBudget } from '@reaatech/agent-eval-harness';
-
-const budget = createBudget({
-  per_task: 0.05,
-  per_trajectory: 1.00,
-  daily: 100.00,
-  alerts: [
-    { threshold: 0.5, action: 'log' },
-    { threshold: 0.75, action: 'notify' },
-    { threshold: 0.9, action: 'block' },
-  ],
-});
-
-const status = await checkBudget(currentSpend, budget);
-if (!status.within_budget) {
-  console.warn(`Budget exceeded: ${status.percentage}% used`);
+import { checkBudget, createBudget, CostTracker } from '@reaatech/agent-eval-harness';
+
+// 3 budget presets: strict, moderate, lenient
+const budget = createBudget('moderate');
+
+// checkBudget(cost: CostBreakdown, budget: BudgetConfig, thresholds?)
+const status = checkBudget(cost, budget);
+
+if (!status.withinBudget) {
+  console.warn(`Budget exceeded: ${status.usagePercentage}% used`);
 }
+
+// Track cumulative costs
+const tracker = new CostTracker({ per_trajectory: 1.00, daily: 100.00 });
+tracker.recordCost(cost);
+console.log(`Daily total: $${formatCost(tracker.getDailyTotal())}`);
+```
+
+### Cost Reporting
+
+```typescript
+import {
+  generateCostReport,
+  exportToCsv,
+  exportToJson,
+  generateSummary,
+  formatCost,
+} from '@reaatech/agent-eval-harness';
+
+const report = generateCostReport(trajectories);
+console.log(formatCost(report.totalCost));
+
+const csv = exportToCsv(report);
+const json = exportToJson(report);
+const summary = generateSummary(report);
 ```
 
 ## Key Metrics
@@ -73,6 +88,27 @@ if (!status.within_budget) {
 | `tool_cost` | Tool invocation costs | USD |
 | `judge_cost` | LLM judge costs | USD |
 
+## Supported Models (DEFAULT_PRICING)
+
+| Model | Input ($/M tokens) | Output ($/M tokens) |
+|-------|-------------------|---------------------|
+| claude-opus | $15.00 | $75.00 |
+| claude-sonnet | $3.00 | $15.00 |
+| claude-haiku | $0.25 | $1.25 |
+| gpt-4-turbo | $10.00 | $30.00 |
+| gpt-4 | $30.00 | $60.00 |
+| gpt-4-mini | $0.15 | $0.60 |
+| gemini-pro | $2.50 | $7.50 |
+| gemini-flash | $0.50 | $1.50 |
+
+## Budget Presets
+
+| Preset | Per Task | Per Trajectory | Daily |
+|--------|----------|----------------|-------|
+| `strict` | $0.02 | $0.50 | $50.00 |
+| `moderate` | $0.05 | $1.00 | $100.00 |
+| `lenient` | $0.10 | $2.00 | $250.00 |
+
 ## Best Practices
 
 1. **Set budget limits** — Define per-task, per-trajectory, and daily budgets
````
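
For orientation on the tables added above, here is a back-of-the-envelope sketch of how per-million-token prices translate into per-call dollar amounts and how those amounts relate to the budget presets. The `estimateCallCost` helper and the token counts are illustrative assumptions for this sketch, not part of the harness API; the prices mirror the DEFAULT_PRICING table.

```typescript
// Hypothetical helper for illustration only; prices mirror DEFAULT_PRICING above.
interface ModelPrice {
  input: number;  // USD per 1M input tokens
  output: number; // USD per 1M output tokens
}

const CLAUDE_SONNET: ModelPrice = { input: 3.0, output: 15.0 };

function estimateCallCost(price: ModelPrice, inputTokens: number, outputTokens: number): number {
  // Prices are quoted per million tokens, so scale the raw counts down first.
  return (inputTokens / 1_000_000) * price.input + (outputTokens / 1_000_000) * price.output;
}

// A single 2,000-token-in / 800-token-out turn on claude-sonnet:
// (0.002 * $3.00) + (0.0008 * $15.00) = $0.006 + $0.012 = $0.018
const turnCost = estimateCallCost(CLAUDE_SONNET, 2_000, 800);
console.log(turnCost.toFixed(4)); // "0.0180"

// Against the `moderate` preset ($0.05 per task), roughly two such turns fit
// before the 90% "block" alert threshold would trip.
```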

skills/eval-gating/skill.md

Lines changed: 131 additions & 32 deletions
````diff
@@ -2,7 +2,7 @@
 
 ## What It Is
 
-Eval gating uses evaluation results to make pass/fail decisions in CI/CD pipelines. It checks metrics against thresholds and baselines, blocking deployments when quality standards aren't met.
+Eval gating uses evaluation results to make pass/fail decisions in CI/CD pipelines. It checks metrics against thresholds and baselines using 4 gate types (threshold, baseline-comparison, regression, custom) with 6 comparison operators. Blocks deployments when quality standards aren't met.
 
 ## Why It Matters
 
@@ -13,16 +13,15 @@ Eval gating uses evaluation results to make pass/fail decisions in CI/CD pipelin
 
 ## How to Use It
 
-### Run Gate Evaluation
+### CLI: Run Gate Check
 
 ```bash
-npx agent-eval-harness gate \
-  --results results/eval-123.json \
-  --gates gates.yaml \
-  --baseline results/baseline.json
+npx agent-eval-harness gate results/results.json \
+  --preset standard \
+  --exit-code
 ```
 
-### Gate Configuration
+### Gate Configuration (YAML)
 
 ```yaml
 # gates.yaml
@@ -64,43 +63,143 @@ gates:
     threshold: 0.85
 ```
 
+### Gate Presets
+
+Three named presets for quick setup:
+
+| Preset | Overall Quality | Cost | Latency P99 | Tool Correctness | Faithfulness |
+|--------|----------------|------|-------------|------------------|--------------|
+| **standard** | >= 0.80 | <= $0.05 | <= 5000ms | >= 0.95 | >= 0.85 |
+| **strict** | >= 0.90 | <= $0.03 | <= 3000ms | >= 0.98 | >= 0.90 |
+| **lenient** | >= 0.70 | <= $0.10 | <= 10000ms | >= 0.85 | >= 0.75 |
+
 ### Programmatic Gate Evaluation
 
 ```typescript
-import { createGateEngine } from '@reaatech/agent-eval-harness';
-
-const engine = createGateEngine([
-  { name: 'quality', metric: 'overall_score', operator: '>=', threshold: 0.80 },
-  { name: 'cost', metric: 'avg_cost_per_task', operator: '<=', threshold: 0.05 },
+import {
+  createGateEngine,
+  getStandardPreset,
+  getStrictPreset,
+  getLenientPreset,
+  CIIntegration,
+} from '@reaatech/agent-eval-harness';
+
+// Use a preset
+const presets = getStandardPreset();
+const engine = createGateEngine(presets.gates);
+
+// Or build custom gates
+const engine2 = createGateEngine([
+  { name: 'quality', type: 'threshold', metric: 'overall_score',
+    operator: '>=', threshold: 0.80 },
+  { name: 'cost', type: 'threshold', metric: 'avg_cost_per_task',
+    operator: '<=', threshold: 0.05 },
 ]);
 
-const result = await engine.evaluate(aggregatedResults);
+// evaluate() is synchronous
+const summary = engine.evaluate(aggregatedResults);
 
-if (result.passed) {
-  console.log('All gates passed');
+if (summary.overallPassed) {
+  console.log('All gates passed');
   process.exit(0);
 } else {
-  console.log('Gates failed:');
-  for (const failure of result.failures) {
-    console.log(`  - ${failure.gate}: ${failure.actual} (expected ${failure.expected})`);
+  console.log('Gates failed:');
+  for (const r of summary.results.filter(r => !r.passed)) {
+    console.log(`  ${r.name}: ${r.actualValue} (threshold: ${r.threshold})`);
   }
   process.exit(1);
 }
 ```
 
+### Custom Gate Factories
+
+```typescript
+import {
+  createOverallQualityGate,
+  createCostGate,
+  createLatencyGate,
+  createFaithfulnessGate,
+  createToolCorrectnessGate,
+  createNoRegressionGate,
+  createPassRateGate,
+  createSLAViolationsGate,
+  createImprovementGate,
+  createSignificanceGate,
+  createMetricRegressionGate,
+} from '@reaatech/agent-eval-harness';
+
+const gates = [
+  createOverallQualityGate(0.85),
+  createCostGate(0.05),
+  createLatencyGate(5000),
+  createNoRegressionGate(baselineResults, 'overall_score'),
+];
+
+const engine = createGateEngine(gates);
+```
+
 ### CI Integration
 
+```typescript
+import {
+  CIIntegration,
+  writeJUnitReport,
+  outputGitHubAnnotations,
+  setGitHubOutput,
+  exportForCI,
+} from '@reaatech/agent-eval-harness';
+
+const summary = engine.evaluate(results);
+
+// GitHub Annotations for PR
+const annotations = CIIntegration.generateGitHubAnnotations(summary);
+annotations.forEach(a => console.log(a));
+
+// JUnit XML for test reporters
+writeJUnitReport(summary, './reports/gates.xml');
+
+// GitHub Actions step outputs
+setGitHubOutput(summary);
+
+// Get CI exit code (0 = pass, 1 = failure)
+const exitCode = CIIntegration.getExitCode(summary);
+process.exit(exitCode);
+
+// Full CI export (annotations + JUnit + outputs + env vars)
+exportForCI(summary, './reports/', process.env);
+```
+
+### GitHub Actions Workflow
+
 ```yaml
-# .github/workflows/ci.yml
-- name: Run evaluation
-  run: npx agent-eval-harness eval trajectories/*.jsonl --output results/
-
-- name: Check gates
-  run: |
-    npx agent-eval-harness gate \
-      --results results/eval.json \
-      --gates gates.yaml \
-      --baseline results/baseline.json
+name: Agent Evaluation
+on:
+  pull_request:
+    branches: [main]
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run evaluation
+        run: |
+          npx agent-eval-harness eval trajectories/*.jsonl \
+            --config eval-config.yaml \
+            --output results/
+
+      - name: Check gates
+        run: |
+          npx agent-eval-harness gate results/results.json \
+            --preset standard \
+            --exit-code
+
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: results/
 ```
 
 ## Key Metrics
@@ -115,10 +214,10 @@ if (result.passed) {
 
 ## Gate Types
 
-1. **Threshold Gates** — Check metric against fixed threshold
-2. **Baseline Gates** — Compare against previous run
-3. **Statistical Gates** — Require statistical significance
-4. **Composite Gates** — Combine multiple metrics
+1. **Threshold Gates** — Check metric against fixed value with comparison operators (`>=`, `<=`, `>`, `<`, `==`, `!=`)
+2. **Baseline-Comparison Gates** — Compare against previous run with regression/improvement detection
+3. **Regression Gates** — Detect specific metric regressions from a baseline
+4. **Custom Gates** — Arbitrary evaluation functions returning pass/fail
 
 ## Best Practices
````
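
The updated Gate Types list documents custom gates, but the diff only shows threshold gates in code. As a rough sketch of the idea, the example below wires an arbitrary pass/fail function into `createGateEngine`; the `evaluate` callback field and the shape of the results object are assumptions (check the package's exported types), while the synchronous `engine.evaluate()` and the `overallPassed` summary field come from the diff above.

```typescript
import { createGateEngine } from '@reaatech/agent-eval-harness';

// Stand-in aggregated results; a real object would come from the eval run.
const aggregatedResults: any = {
  tasks: [{ cost: 0.01 }, { cost: 0.02 }, { cost: 0.09 }],
};

// Pass only if no single task cost more than 5x the average task cost.
// ASSUMPTION: custom gates accept an `evaluate` callback returning a boolean.
const noExpensiveOutliersGate: any = {
  name: 'no-expensive-outliers',
  type: 'custom',
  evaluate: (results: any) => {
    const costs: number[] = (results.tasks ?? []).map((t: any) => t.cost);
    if (costs.length === 0) return true;
    const avg = costs.reduce((a: number, b: number) => a + b, 0) / costs.length;
    return costs.every((c) => c <= 5 * avg);
  },
};

const engine = createGateEngine([noExpensiveOutliersGate]);
const summary = engine.evaluate(aggregatedResults); // synchronous, per the docs above
if (!summary.overallPassed) process.exit(1);
```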
