refactor: remove scoring heuristic, use noise filter + structured metadata (#824)

jackwener · web-flow · commit 009c25b955af · 2026-04-06T03:21:15.000+08:00
* refactor: remove scoring heuristic, replace with noise filter + metadata

The scoring mechanism was a pre-LLM heuristic that compressed rich endpoint
metadata into a single number. Since this project is designed for AI Agents,
the agent can reason about structured metadata directly.

Changes:
- Remove scoreEndpoint/scoreRequest/scoreWriteRequest and all score fields
- Replace with isNoiseUrl() filter (tracking/beacon/pixel) + isUsefulEndpoint()
- Remove artificial confidence percentages (was score/20)
- Sort by itemCount (transparent, observable) instead of weighted score
- Endpoints now expose full structured metadata for agent consumption
- Net change: -43 lines

* fix: widen endpoint filter to keep single-object JSON and stats/metric URLs

- Remove stats/metric from noise pattern — these are often business APIs
- Relax isUsefulEndpoint to keep any JSON endpoint, not just arrays
  (preserves /me, /profile, /detail and other single-object APIs)

* fix: add deterministic endpoint ordering for generate/synthesize path

The AI agent path doesn't need ranking, but generate/synthesize still
pick candidates[0] as default — this needs a stable, explainable order.

- Add endpointSortKey() with transparent observable signals: array items,
  detected fields, API path patterns, query params
- Update synthesize chooseEndpoint fallback to use itemCount + field count
- Sort key is internal only; not exposed as score to external consumers
diff --git a/src/analysis.ts b/src/analysis.ts
@@ -150,30 +150,13 @@ export function detectAuthFromContent(url: string, body: unknown): string[] {
   return indicators;
 }
 
-// ── Shared scoring helpers ───────────────────────────────────────────────────
-
-/** URL-based score adjustments shared by explore and record scoring. */
-export function applyUrlScoreAdjustments(url: string, score: number): number {
-  let s = score;
-  if (url.includes('/api/') || url.includes('/x/')) s += 3;
-  if (url.match(/\/(track|log|analytics|beacon|pixel|stats|metric)/i)) s -= 10;
-  if (url.match(/\/(ping|heartbeat|keep.?alive)/i)) s -= 10;
-  return s;
-}
+// ── Noise filtering ─────────────────────────────────────────────────────────
 
-/** Score an array response based on item count and detected field roles. */
-export function scoreArrayResponse(arrayResult: ArrayDiscovery | null): number {
-  if (!arrayResult) return 0;
-  let s = 10;
-  s += Math.min(arrayResult.items.length, 10);
-  const sample = arrayResult.items[0];
-  if (sample && typeof sample === 'object') {
-    const keys = Object.keys(sample as object).map(k => k.toLowerCase());
-    for (const aliases of Object.values(FIELD_ROLES)) {
-      if (aliases.some(a => keys.includes(a))) s += 2;
-    }
-  }
-  return s;
+const NOISE_URL_PATTERN = /\/(track|log|analytics|beacon|pixel|ping|heartbeat|keep.?alive)\b/i;
+
+/** True when the URL contains "/" directly followed by a noise word (track, log, analytics, beacon, pixel, ping, heartbeat, keepalive variants) ending at a word boundary, case-insensitively — so "/log" matches but "/login" and "/tracking" do not. */
+export function isNoiseUrl(url: string): boolean {
+  return NOISE_URL_PATTERN.test(url);
 }
 
 // ── Query param classification ──────────────────────────────────────────────
diff --git a/src/explore.ts b/src/explore.ts
@@ -25,6 +25,7 @@ import {
   inferStrategy,
   detectAuthFromHeaders,
   classifyQueryParams,
+  isNoiseUrl,
 } from './analysis.js';
 
 // ── Site name detection ────────────────────────────────────────────────────
@@ -67,14 +68,14 @@ interface NetworkEntry {
 
 interface AnalyzedEndpoint {
   pattern: string; method: string; url: string; status: number | null;
-  contentType: string; queryParams: string[]; score: number;
+  contentType: string; queryParams: string[];
   hasSearchParam: boolean; hasPaginationParam: boolean; hasLimitParam: boolean;
   authIndicators: string[];
   responseAnalysis: { itemPath: string | null; itemCount: number; detectedFields: Record<string, string>; sampleFields: string[] } | null;
 }
 
 interface InferredCapability {
-  name: string; description: string; strategy: string; confidence: number;
+  name: string; description: string; strategy: string;
   endpoint: string; itemPath: string | null;
   recommendedColumns: string[];
   recommendedArgs: Array<{ name: string; type: string; required: boolean; default?: unknown }>;
@@ -104,7 +105,6 @@ export interface ExploreEndpointArtifact {
   url: string;
   status: number | null;
   contentType: string;
-  score: number;
   queryParams: string[];
   itemPath: string | null;
   itemCount: number;
@@ -194,17 +194,29 @@ function isBooleanRecord(value: unknown): value is Record<string, boolean> {
     && Object.values(value as Record<string, unknown>).every(v => typeof v === 'boolean');
 }
 
-function scoreEndpoint(ep: { contentType: string; responseAnalysis: AnalyzedEndpoint['responseAnalysis']; pattern: string; status: number | null; hasSearchParam: boolean; hasPaginationParam: boolean; hasLimitParam: boolean }): number {
-  let s = 0;
-  if (ep.contentType.includes('json')) s += 10;
-  if (ep.responseAnalysis) { s += 5; s += Math.min(ep.responseAnalysis.itemCount, 10); s += Object.keys(ep.responseAnalysis.detectedFields).length * 2; }
-  if (ep.pattern.includes('/api/') || ep.pattern.includes('/x/')) s += 3;
-  if (ep.hasSearchParam) s += 3;
-  if (ep.hasPaginationParam) s += 2;
-  if (ep.hasLimitParam) s += 2;
-  if (ep.status === 200) s += 2;
-  if (ep.responseAnalysis && ep.responseAnalysis.itemCount === 0 && ep.contentType.includes('json')) s -= 3;
-  return s;
+/**
+ * Deterministic ordering key for analyzed endpoints, built only from observable metadata; larger keys sort first.
+ * Used by generate/synthesize to pick a stable default candidate.
+ * Not exposed externally; AI agents see the raw metadata and decide for themselves.
+ */
+function endpointSortKey(ep: AnalyzedEndpoint): number {
+  let k = 0;
+  // Endpoints whose response analysis found items get a flat +100, plus the item count capped at 50
+  const items = ep.responseAnalysis?.itemCount ?? 0;
+  if (items > 0) k += 100 + Math.min(items, 50);
+  // Each detected semantic field adds 10
+  k += Object.keys(ep.responseAnalysis?.detectedFields ?? {}).length * 10;
+  // Small bonus for API-style paths (/api/ or /x/)
+  if (ep.pattern.includes('/api/') || ep.pattern.includes('/x/')) k += 5;
+  // Small bonus when a search, pagination, or limit param was detected (one bonus, not one per flag)
+  if (ep.hasSearchParam || ep.hasPaginationParam || ep.hasLimitParam) k += 5;
+  return k;
+}
+
+/** Keep an endpoint only if its URL is not noise (isNoiseUrl) and its content type contains 'json' — any JSON response qualifies, not just arrays. */
+function isUsefulEndpoint(ep: AnalyzedEndpoint): boolean {
+  if (isNoiseUrl(ep.url)) return false;
+  return ep.contentType.includes('json');
 }
 
 
@@ -229,7 +241,7 @@ const INTERACT_FUZZ_JS = interactFuzz.toString();
 
 // ── Analysis helpers (extracted from exploreUrl) ───────────────────────────
 
-/** Filter, deduplicate, and score network endpoints. */
+/** Deduplicate network endpoints, keep only the useful ones (non-noise JSON), and order them by endpointSortKey. */
 function analyzeEndpoints(networkEntries: NetworkEntry[]): { analyzed: AnalyzedEndpoint[]; totalCount: number } {
   const seen = new Map<string, AnalyzedEndpoint>();
   for (const entry of networkEntries) {
@@ -251,13 +263,14 @@ function analyzeEndpoints(networkEntries: NetworkEntry[]): { analyzed: AnalyzedE
       hasLimitParam: hasLimit || qp.some(p => LIMIT_PARAMS.has(p)),
       authIndicators: detectAuthFromHeaders(entry.requestHeaders),
       responseAnalysis: entry.responseBody ? analyzeResponseBody(entry.responseBody) : null,
-      score: 0,
     };
-    ep.score = scoreEndpoint(ep);
     seen.set(key, ep);
   }
 
-  const analyzed = [...seen.values()].filter(ep => ep.score >= 5).sort((a, b) => b.score - a.score);
+  // Filter to useful endpoints; deterministic ordering by observable metadata signals
+  const analyzed = [...seen.values()]
+    .filter(isUsefulEndpoint)
+    .sort((a, b) => endpointSortKey(b) - endpointSortKey(a));
   return { analyzed, totalCount: seen.size };
 }
 
@@ -305,7 +318,7 @@ function inferCapabilitiesFromEndpoints(
     capabilities.push({
       name: capName, description: `${opts.site ?? detectSiteName(opts.url)} ${capName}`,
       strategy: storeHint ? 'store-action' : epStrategy,
-      confidence: Math.min(ep.score / 20, 1.0), endpoint: ep.pattern,
+      endpoint: ep.pattern,
       itemPath: ep.responseAnalysis?.itemPath ?? null,
       recommendedColumns: cols.length ? cols : ['title', 'url'],
       recommendedArgs: args,
@@ -337,7 +350,7 @@ async function writeExploreArtifacts(
     }, null, 2)),
     fs.promises.writeFile(path.join(targetDir, 'endpoints.json'), JSON.stringify(analyzedEndpoints.map(ep => ({
       pattern: ep.pattern, method: ep.method, url: ep.url, status: ep.status,
-      contentType: ep.contentType, score: ep.score, queryParams: ep.queryParams,
+      contentType: ep.contentType, queryParams: ep.queryParams,
       itemPath: ep.responseAnalysis?.itemPath ?? null, itemCount: ep.responseAnalysis?.itemCount ?? 0,
       detectedFields: ep.responseAnalysis?.detectedFields ?? {}, authIndicators: ep.authIndicators,
     })), null, 2)),
@@ -485,7 +498,7 @@ export function renderExploreSummary(result: ExploreResult): string {
   ];
   for (const cap of (result.capabilities ?? []).slice(0, 5)) {
     const storeInfo = cap.storeHint ? ` → ${cap.storeHint.store}.${cap.storeHint.action}()` : '';
-    lines.push(`  • ${cap.name} (${cap.strategy}, ${(cap.confidence * 100).toFixed(0)}%)${storeInfo}`);
+    lines.push(`  • ${cap.name} (${cap.strategy})${storeInfo}`);
   }
   const fw = result.framework ?? {};
   const fwNames = Object.entries(fw).filter(([, v]) => v).map(([k]) => k);
diff --git a/src/generate.ts b/src/generate.ts
@@ -38,7 +38,7 @@ export interface GenerateCliResult {
   };
   synthesize: {
     candidate_count: number;
-    candidates: Array<Pick<SynthesizeCandidateSummary, 'name' | 'strategy' | 'confidence'>>;
+    candidates: Array<Pick<SynthesizeCandidateSummary, 'name' | 'strategy'>>;
   };
 }
 
@@ -71,7 +71,7 @@ function normalizeGoal(goal?: string | null): string | null {
  */
 function selectCandidate(candidates: SynthesizeResult['candidates'], goal?: string | null): SynthesizeCandidateSummary | null {
   if (!candidates.length) return null;
-  if (!goal) return candidates[0]; // highest confidence first
+  if (!goal) return candidates[0];
 
   const normalized = normalizeGoal(goal);
   if (normalized) {
@@ -127,7 +127,6 @@ export async function generateCliFromUrl(opts: GenerateCliOptions): Promise<Gene
       candidates: (synthesizeResult.candidates ?? []).map((c) => ({
         name: c.name,
         strategy: c.strategy,
-        confidence: c.confidence,
       })),
     },
   };
@@ -150,7 +149,7 @@ export function renderGenerateSummary(r: GenerateCliResult): string {
   ];
 
   for (const c of r.synthesize?.candidates ?? []) {
-    lines.push(`    • ${c.name} (${c.strategy}, ${((c.confidence ?? 0) * 100).toFixed(0)}%)`);
+    lines.push(`    • ${c.name} (${c.strategy})`);
   }
 
   const fw = r.explore?.framework ?? {};
diff --git a/src/record.ts b/src/record.ts
@@ -27,8 +27,7 @@ import {
   inferStrategy,
   detectAuthFromContent,
   classifyQueryParams,
-  applyUrlScoreAdjustments,
-  scoreArrayResponse,
+  isNoiseUrl,
 } from './analysis.js';
 
 // ── Types ──────────────────────────────────────────────────────────────────
@@ -64,7 +63,6 @@ type RecordedCandidateKind = 'read' | 'write';
 export interface RecordedCandidate {
   kind: RecordedCandidateKind;
   req: RecordedRequest;
-  score: number;
   arrayResult: ReturnType<typeof findArrayPath> | null;
 }
 
@@ -75,18 +73,11 @@ interface GeneratedRecordedCandidate {
   yaml: unknown;
 }
 
-/** Keep the stronger candidate when multiple recordings share one bucket. */
-function preferRecordedCandidate(current: RecordedCandidate, next: RecordedCandidate): RecordedCandidate {
-  if (next.score > current.score) return next;
-  if (next.score < current.score) return current;
+/** Resolve a dedupe-bucket collision by always returning the later candidate (treated as fresher data); `_current` is ignored. */
+function preferRecordedCandidate(_current: RecordedCandidate, next: RecordedCandidate): RecordedCandidate {
   return next;
 }
 
-/** Apply shared endpoint score tweaks. */
-function applyCommonEndpointScoreAdjustments(req: RecordedRequest, score: number): number {
-  return applyUrlScoreAdjustments(req.url, score);
-}
-
 /** Build a candidate-level dedupe key. */
 function getRecordedCandidateKey(candidate: RecordedCandidate): string {
   return `${candidate.kind} ${getRecordedRequestKey(candidate.req)}`;
@@ -327,10 +318,6 @@ function generateReadRecordedJs(): string {
 
 // ── Analysis helpers ───────────────────────────────────────────────────────
 
-function scoreRequest(req: RecordedRequest, arrayResult: ReturnType<typeof findArrayPath> | null): number {
-  return applyCommonEndpointScoreAdjustments(req, scoreArrayResponse(arrayResult));
-}
-
 /** Check whether one recorded request is safe to treat as a write candidate. */
 function isWriteCandidate(req: RecordedRequest): boolean {
   return ['POST', 'PUT', 'PATCH'].includes(req.method)
@@ -343,24 +330,18 @@ function isWriteCandidate(req: RecordedRequest): boolean {
     && !Array.isArray(req.responseBody);
 }
 
-/** Score replayable write requests while keeping tracking and heartbeat traffic suppressed. */
-function scoreWriteRequest(req: RecordedRequest): number {
-  return applyCommonEndpointScoreAdjustments(req, 6);
-}
-
-/** Analyze recorded requests into read and write candidates. */
+/** Analyze recorded requests into read and write candidates, filtering out noise. */
 export function analyzeRecordedRequests(requests: RecordedRequest[]): { candidates: RecordedCandidate[] } {
   const candidates: RecordedCandidate[] = [];
   for (const req of requests) {
+    if (isNoiseUrl(req.url)) continue;
     const arrayResult = findArrayPath(req.responseBody);
     if (isWriteCandidate(req)) {
-      const score = scoreWriteRequest(req);
-      if (score > 0) candidates.push({ kind: 'write', req, score, arrayResult: null });
+      candidates.push({ kind: 'write', req, arrayResult: null });
       continue;
     }
     if (arrayResult) {
-      const score = scoreRequest(req, arrayResult);
-      if (score > 0) candidates.push({ kind: 'read', req, score, arrayResult });
+      candidates.push({ kind: 'read', req, arrayResult });
     }
   }
   return { candidates };
@@ -532,9 +513,9 @@ export function generateRecordedCandidates(
     deduped.set(key, current ? preferRecordedCandidate(current, candidate) : candidate);
   }
 
+  // Sort by array item count, richest first (candidates without an array result count as 0), then keep the top 5
   const selected = [...deduped.values()]
-    .filter((candidate) => candidate.kind === 'read' ? candidate.score >= 8 : candidate.score >= 6)
-    .sort((a, b) => b.score - a.score)
+    .sort((a, b) => (b.arrayResult?.items.length ?? 0) - (a.arrayResult?.items.length ?? 0))
     .slice(0, 5);
 
   const usedNames = new Set<string>();
@@ -741,14 +722,14 @@ function analyzeAndWrite(
   const candidates: RecordResult['candidates'] = [];
   const usedNames = new Set<string>();
 
-  console.log(chalk.bold('\n  Captured endpoints (scored):\n'));
+  console.log(chalk.bold('\n  Captured endpoints:\n'));
 
-  for (const entry of analysis.candidates.sort((a, b) => b.score - a.score).slice(0, 8)) {
+  for (const entry of analysis.candidates.sort((a, b) => (b.arrayResult?.items.length ?? 0) - (a.arrayResult?.items.length ?? 0)).slice(0, 8)) {
     const itemCount = entry.arrayResult?.items.length ?? 0;
     const strategy = entry.kind === 'write'
       ? 'cookie'
       : inferStrategy(detectAuthFromContent(entry.req.url, entry.req.responseBody));
-    const marker = entry.score >= 15 ? chalk.green('★') : entry.score >= 8 ? chalk.yellow('◆') : chalk.dim('·');
+    const marker = entry.kind === 'write' ? chalk.magenta('✎') : itemCount > 5 ? chalk.green('★') : chalk.dim('·');
     console.log(
       `  ${marker} ${chalk.white(urlToPattern(entry.req.url))}` +
       chalk.dim(` [${strategy}]`) +
@@ -777,7 +758,7 @@ function analyzeAndWrite(
   }
 
   if (candidates.length === 0) {
-    console.log(chalk.yellow('  No high-confidence candidates found.'));
+    console.log(chalk.yellow('  No candidates found.'));
     console.log(chalk.dim('  Tip: make sure you triggered JSON API calls (open lists, search, scroll).'));
   }
 
diff --git a/src/synthesize.ts b/src/synthesize.ts
@@ -26,7 +26,6 @@ export interface SynthesizeCapability {
   name: string;
   description: string;
   strategy: string;
-  confidence?: number;
   endpoint?: string;
   itemPath?: string | null;
   recommendedColumns?: string[];
@@ -67,7 +66,6 @@ export interface SynthesizeCandidateSummary {
   name: string;
   path: string;
   strategy: string;
-  confidence?: number;
 }
 
 export interface SynthesizeResult {
@@ -98,7 +96,6 @@ export function synthesizeFromExplore(
 
   const site = bundle.manifest.site;
   const capabilities = (bundle.capabilities ?? [])
-    .sort((a, b) => (b.confidence ?? 0) - (a.confidence ?? 0))
     .slice(0, opts.top ?? 3);
   const candidates: SynthesizeCandidateSummary[] = [];
 
@@ -108,7 +105,7 @@ export function synthesizeFromExplore(
     const candidate = buildCandidateYaml(site, bundle.manifest, cap, endpoint);
     const filePath = path.join(targetDir, `${candidate.name}.yaml`);
     fs.writeFileSync(filePath, yaml.dump(candidate.yaml, { sortKeys: false, lineWidth: 120 }));
-    candidates.push({ name: candidate.name, path: filePath, strategy: cap.strategy, confidence: cap.confidence });
+    candidates.push({ name: candidate.name, path: filePath, strategy: cap.strategy });
   }
 
   const index = { site, target_url: bundle.manifest.target_url, generated_from: exploreDir, candidate_count: candidates.length, candidates };
@@ -119,7 +116,7 @@ export function synthesizeFromExplore(
 
 export function renderSynthesizeSummary(result: SynthesizeResult): string {
   const lines = ['opencli synthesize: OK', `Site: ${result.site}`, `Source: ${result.explore_dir}`, `Candidates: ${result.candidate_count}`];
-  for (const c of result.candidates ?? []) lines.push(`  • ${c.name} (${c.strategy}, ${((c.confidence ?? 0) * 100).toFixed(0)}% confidence) → ${c.path}`);
+  for (const c of result.candidates ?? []) lines.push(`  • ${c.name} (${c.strategy}) → ${c.path}`);
   return lines.join('\n');
 }
 
@@ -147,7 +144,12 @@ function chooseEndpoint(cap: SynthesizeCapability, endpoints: ExploreEndpointArt
     const match = endpoints.find((endpoint) => endpoint.pattern === endpointPattern || endpoint.url?.includes(endpointPattern));
     if (match) return match;
   }
-  return [...endpoints].sort((a, b) => (b.score ?? 0) - (a.score ?? 0))[0];
+  // Fallback: prefer endpoint with most data (item count + detected fields)
+  return [...endpoints].sort((a, b) => {
+    const aKey = (a.itemCount ?? 0) * 10 + Object.keys(a.detectedFields ?? {}).length;
+    const bKey = (b.itemCount ?? 0) * 10 + Object.keys(b.detectedFields ?? {}).length;
+    return bKey - aKey;
+  })[0];
 }
 
 // ── URL templating ─────────────────────────────────────────────────────────