Skip to content

Commit 009c25b

Browse files
authored
refactor: remove scoring heuristic, use noise filter + structured metadata (#824)
* refactor: remove scoring heuristic, replace with noise filter + metadata

  The scoring mechanism was a pre-LLM heuristic that compressed rich endpoint metadata into a single number. Since this project is designed for AI Agents, the agent can reason about structured metadata directly.

  Changes:
  - Remove scoreEndpoint/scoreRequest/scoreWriteRequest and all score fields
  - Replace with isNoiseUrl() filter (tracking/beacon/pixel) + isUsefulEndpoint()
  - Remove artificial confidence percentages (was score/20)
  - Sort by itemCount (transparent, observable) instead of weighted score
  - Endpoints now expose full structured metadata for agent consumption
  - Net reduction: -43 lines

* fix: widen endpoint filter to keep single-object JSON and stats/metric URLs

  - Remove stats/metric from noise pattern — these are often business APIs
  - Relax isUsefulEndpoint to keep any JSON endpoint, not just arrays (preserves /me, /profile, /detail and other single-object APIs)

* fix: add deterministic endpoint ordering for generate/synthesize path

  The AI agent path doesn't need ranking, but generate/synthesize still pick candidates[0] as default — this needs a stable, explainable order.

  - Add endpointSortKey() with transparent observable signals: array items, detected fields, API path patterns, query params
  - Update synthesize chooseEndpoint fallback to use itemCount + field count
  - Sort key is internal only; not exposed as score to external consumers
1 parent 5553300 commit 009c25b

5 files changed

Lines changed: 64 additions & 86 deletions

File tree

src/analysis.ts

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -150,30 +150,13 @@ export function detectAuthFromContent(url: string, body: unknown): string[] {
150150
return indicators;
151151
}
152152

153-
// ── Shared scoring helpers ───────────────────────────────────────────────────
154-
155-
/** URL-based score adjustments shared by explore and record scoring. */
156-
export function applyUrlScoreAdjustments(url: string, score: number): number {
157-
let s = score;
158-
if (url.includes('/api/') || url.includes('/x/')) s += 3;
159-
if (url.match(/\/(track|log|analytics|beacon|pixel|stats|metric)/i)) s -= 10;
160-
if (url.match(/\/(ping|heartbeat|keep.?alive)/i)) s -= 10;
161-
return s;
162-
}
153+
// ── Noise filtering ─────────────────────────────────────────────────────────
163154

164-
/** Score an array response based on item count and detected field roles. */
165-
export function scoreArrayResponse(arrayResult: ArrayDiscovery | null): number {
166-
if (!arrayResult) return 0;
167-
let s = 10;
168-
s += Math.min(arrayResult.items.length, 10);
169-
const sample = arrayResult.items[0];
170-
if (sample && typeof sample === 'object') {
171-
const keys = Object.keys(sample as object).map(k => k.toLowerCase());
172-
for (const aliases of Object.values(FIELD_ROLES)) {
173-
if (aliases.some(a => keys.includes(a))) s += 2;
174-
}
175-
}
176-
return s;
155+
const NOISE_URL_PATTERN = /\/(track|log|analytics|beacon|pixel|ping|heartbeat|keep.?alive)\b/i;
156+
157+
/** Check whether a URL looks like tracking/telemetry noise rather than a business API. */
158+
export function isNoiseUrl(url: string): boolean {
159+
return NOISE_URL_PATTERN.test(url);
177160
}
178161

179162
// ── Query param classification ──────────────────────────────────────────────

src/explore.ts

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import {
2525
inferStrategy,
2626
detectAuthFromHeaders,
2727
classifyQueryParams,
28+
isNoiseUrl,
2829
} from './analysis.js';
2930

3031
// ── Site name detection ────────────────────────────────────────────────────
@@ -67,14 +68,14 @@ interface NetworkEntry {
6768

6869
interface AnalyzedEndpoint {
6970
pattern: string; method: string; url: string; status: number | null;
70-
contentType: string; queryParams: string[]; score: number;
71+
contentType: string; queryParams: string[];
7172
hasSearchParam: boolean; hasPaginationParam: boolean; hasLimitParam: boolean;
7273
authIndicators: string[];
7374
responseAnalysis: { itemPath: string | null; itemCount: number; detectedFields: Record<string, string>; sampleFields: string[] } | null;
7475
}
7576

7677
interface InferredCapability {
77-
name: string; description: string; strategy: string; confidence: number;
78+
name: string; description: string; strategy: string;
7879
endpoint: string; itemPath: string | null;
7980
recommendedColumns: string[];
8081
recommendedArgs: Array<{ name: string; type: string; required: boolean; default?: unknown }>;
@@ -104,7 +105,6 @@ export interface ExploreEndpointArtifact {
104105
url: string;
105106
status: number | null;
106107
contentType: string;
107-
score: number;
108108
queryParams: string[];
109109
itemPath: string | null;
110110
itemCount: number;
@@ -194,17 +194,29 @@ function isBooleanRecord(value: unknown): value is Record<string, boolean> {
194194
&& Object.values(value as Record<string, unknown>).every(v => typeof v === 'boolean');
195195
}
196196

197-
function scoreEndpoint(ep: { contentType: string; responseAnalysis: AnalyzedEndpoint['responseAnalysis']; pattern: string; status: number | null; hasSearchParam: boolean; hasPaginationParam: boolean; hasLimitParam: boolean }): number {
198-
let s = 0;
199-
if (ep.contentType.includes('json')) s += 10;
200-
if (ep.responseAnalysis) { s += 5; s += Math.min(ep.responseAnalysis.itemCount, 10); s += Object.keys(ep.responseAnalysis.detectedFields).length * 2; }
201-
if (ep.pattern.includes('/api/') || ep.pattern.includes('/x/')) s += 3;
202-
if (ep.hasSearchParam) s += 3;
203-
if (ep.hasPaginationParam) s += 2;
204-
if (ep.hasLimitParam) s += 2;
205-
if (ep.status === 200) s += 2;
206-
if (ep.responseAnalysis && ep.responseAnalysis.itemCount === 0 && ep.contentType.includes('json')) s -= 3;
207-
return s;
197+
/**
198+
* Deterministic sort key for endpoint ordering — transparent, observable signals only.
199+
* Used by generate/synthesize to pick a stable default candidate.
200+
* Not exposed externally; AI agents see the raw metadata and decide for themselves.
201+
*/
202+
function endpointSortKey(ep: AnalyzedEndpoint): number {
203+
let k = 0;
204+
// Prefer endpoints with array data (list APIs are more useful for automation)
205+
const items = ep.responseAnalysis?.itemCount ?? 0;
206+
if (items > 0) k += 100 + Math.min(items, 50);
207+
// Prefer endpoints with detected semantic fields
208+
k += Object.keys(ep.responseAnalysis?.detectedFields ?? {}).length * 10;
209+
// Prefer API-style paths
210+
if (ep.pattern.includes('/api/') || ep.pattern.includes('/x/')) k += 5;
211+
// Prefer endpoints with search, pagination, or limit params (more likely to be parameterized APIs)
212+
if (ep.hasSearchParam || ep.hasPaginationParam || ep.hasLimitParam) k += 5;
213+
return k;
214+
}
215+
216+
/** Check whether an endpoint carries useful structured data (any JSON response, not noise). */
217+
function isUsefulEndpoint(ep: AnalyzedEndpoint): boolean {
218+
if (isNoiseUrl(ep.url)) return false;
219+
return ep.contentType.includes('json');
208220
}
209221

210222

@@ -229,7 +241,7 @@ const INTERACT_FUZZ_JS = interactFuzz.toString();
229241

230242
// ── Analysis helpers (extracted from exploreUrl) ───────────────────────────
231243

232-
/** Filter, deduplicate, and score network endpoints. */
244+
/** Filter and deduplicate network endpoints, keeping only useful structured-data APIs. */
233245
function analyzeEndpoints(networkEntries: NetworkEntry[]): { analyzed: AnalyzedEndpoint[]; totalCount: number } {
234246
const seen = new Map<string, AnalyzedEndpoint>();
235247
for (const entry of networkEntries) {
@@ -251,13 +263,14 @@ function analyzeEndpoints(networkEntries: NetworkEntry[]): { analyzed: AnalyzedE
251263
hasLimitParam: hasLimit || qp.some(p => LIMIT_PARAMS.has(p)),
252264
authIndicators: detectAuthFromHeaders(entry.requestHeaders),
253265
responseAnalysis: entry.responseBody ? analyzeResponseBody(entry.responseBody) : null,
254-
score: 0,
255266
};
256-
ep.score = scoreEndpoint(ep);
257267
seen.set(key, ep);
258268
}
259269

260-
const analyzed = [...seen.values()].filter(ep => ep.score >= 5).sort((a, b) => b.score - a.score);
270+
// Filter to useful endpoints; deterministic ordering by observable metadata signals
271+
const analyzed = [...seen.values()]
272+
.filter(isUsefulEndpoint)
273+
.sort((a, b) => endpointSortKey(b) - endpointSortKey(a));
261274
return { analyzed, totalCount: seen.size };
262275
}
263276

@@ -305,7 +318,7 @@ function inferCapabilitiesFromEndpoints(
305318
capabilities.push({
306319
name: capName, description: `${opts.site ?? detectSiteName(opts.url)} ${capName}`,
307320
strategy: storeHint ? 'store-action' : epStrategy,
308-
confidence: Math.min(ep.score / 20, 1.0), endpoint: ep.pattern,
321+
endpoint: ep.pattern,
309322
itemPath: ep.responseAnalysis?.itemPath ?? null,
310323
recommendedColumns: cols.length ? cols : ['title', 'url'],
311324
recommendedArgs: args,
@@ -337,7 +350,7 @@ async function writeExploreArtifacts(
337350
}, null, 2)),
338351
fs.promises.writeFile(path.join(targetDir, 'endpoints.json'), JSON.stringify(analyzedEndpoints.map(ep => ({
339352
pattern: ep.pattern, method: ep.method, url: ep.url, status: ep.status,
340-
contentType: ep.contentType, score: ep.score, queryParams: ep.queryParams,
353+
contentType: ep.contentType, queryParams: ep.queryParams,
341354
itemPath: ep.responseAnalysis?.itemPath ?? null, itemCount: ep.responseAnalysis?.itemCount ?? 0,
342355
detectedFields: ep.responseAnalysis?.detectedFields ?? {}, authIndicators: ep.authIndicators,
343356
})), null, 2)),
@@ -485,7 +498,7 @@ export function renderExploreSummary(result: ExploreResult): string {
485498
];
486499
for (const cap of (result.capabilities ?? []).slice(0, 5)) {
487500
const storeInfo = cap.storeHint ? ` → ${cap.storeHint.store}.${cap.storeHint.action}()` : '';
488-
lines.push(` • ${cap.name} (${cap.strategy}, ${(cap.confidence * 100).toFixed(0)}%)${storeInfo}`);
501+
lines.push(` • ${cap.name} (${cap.strategy})${storeInfo}`);
489502
}
490503
const fw = result.framework ?? {};
491504
const fwNames = Object.entries(fw).filter(([, v]) => v).map(([k]) => k);

src/generate.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ export interface GenerateCliResult {
3838
};
3939
synthesize: {
4040
candidate_count: number;
41-
candidates: Array<Pick<SynthesizeCandidateSummary, 'name' | 'strategy' | 'confidence'>>;
41+
candidates: Array<Pick<SynthesizeCandidateSummary, 'name' | 'strategy'>>;
4242
};
4343
}
4444

@@ -71,7 +71,7 @@ function normalizeGoal(goal?: string | null): string | null {
7171
*/
7272
function selectCandidate(candidates: SynthesizeResult['candidates'], goal?: string | null): SynthesizeCandidateSummary | null {
7373
if (!candidates.length) return null;
74-
if (!goal) return candidates[0]; // highest confidence first
74+
if (!goal) return candidates[0];
7575

7676
const normalized = normalizeGoal(goal);
7777
if (normalized) {
@@ -127,7 +127,6 @@ export async function generateCliFromUrl(opts: GenerateCliOptions): Promise<Gene
127127
candidates: (synthesizeResult.candidates ?? []).map((c) => ({
128128
name: c.name,
129129
strategy: c.strategy,
130-
confidence: c.confidence,
131130
})),
132131
},
133132
};
@@ -150,7 +149,7 @@ export function renderGenerateSummary(r: GenerateCliResult): string {
150149
];
151150

152151
for (const c of r.synthesize?.candidates ?? []) {
153-
lines.push(` • ${c.name} (${c.strategy}, ${((c.confidence ?? 0) * 100).toFixed(0)}%)`);
152+
lines.push(` • ${c.name} (${c.strategy})`);
154153
}
155154

156155
const fw = r.explore?.framework ?? {};

src/record.ts

Lines changed: 13 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@ import {
2727
inferStrategy,
2828
detectAuthFromContent,
2929
classifyQueryParams,
30-
applyUrlScoreAdjustments,
31-
scoreArrayResponse,
30+
isNoiseUrl,
3231
} from './analysis.js';
3332

3433
// ── Types ──────────────────────────────────────────────────────────────────
@@ -64,7 +63,6 @@ type RecordedCandidateKind = 'read' | 'write';
6463
export interface RecordedCandidate {
6564
kind: RecordedCandidateKind;
6665
req: RecordedRequest;
67-
score: number;
6866
arrayResult: ReturnType<typeof findArrayPath> | null;
6967
}
7068

@@ -75,18 +73,11 @@ interface GeneratedRecordedCandidate {
7573
yaml: unknown;
7674
}
7775

78-
/** Keep the stronger candidate when multiple recordings share one bucket. */
79-
function preferRecordedCandidate(current: RecordedCandidate, next: RecordedCandidate): RecordedCandidate {
80-
if (next.score > current.score) return next;
81-
if (next.score < current.score) return current;
76+
/** Keep the later candidate when multiple recordings share one bucket (prefer fresher data). */
77+
function preferRecordedCandidate(_current: RecordedCandidate, next: RecordedCandidate): RecordedCandidate {
8278
return next;
8379
}
8480

85-
/** Apply shared endpoint score tweaks. */
86-
function applyCommonEndpointScoreAdjustments(req: RecordedRequest, score: number): number {
87-
return applyUrlScoreAdjustments(req.url, score);
88-
}
89-
9081
/** Build a candidate-level dedupe key. */
9182
function getRecordedCandidateKey(candidate: RecordedCandidate): string {
9283
return `${candidate.kind} ${getRecordedRequestKey(candidate.req)}`;
@@ -327,10 +318,6 @@ function generateReadRecordedJs(): string {
327318

328319
// ── Analysis helpers ───────────────────────────────────────────────────────
329320

330-
function scoreRequest(req: RecordedRequest, arrayResult: ReturnType<typeof findArrayPath> | null): number {
331-
return applyCommonEndpointScoreAdjustments(req, scoreArrayResponse(arrayResult));
332-
}
333-
334321
/** Check whether one recorded request is safe to treat as a write candidate. */
335322
function isWriteCandidate(req: RecordedRequest): boolean {
336323
return ['POST', 'PUT', 'PATCH'].includes(req.method)
@@ -343,24 +330,18 @@ function isWriteCandidate(req: RecordedRequest): boolean {
343330
&& !Array.isArray(req.responseBody);
344331
}
345332

346-
/** Score replayable write requests while keeping tracking and heartbeat traffic suppressed. */
347-
function scoreWriteRequest(req: RecordedRequest): number {
348-
return applyCommonEndpointScoreAdjustments(req, 6);
349-
}
350-
351-
/** Analyze recorded requests into read and write candidates. */
333+
/** Analyze recorded requests into read and write candidates, filtering out noise. */
352334
export function analyzeRecordedRequests(requests: RecordedRequest[]): { candidates: RecordedCandidate[] } {
353335
const candidates: RecordedCandidate[] = [];
354336
for (const req of requests) {
337+
if (isNoiseUrl(req.url)) continue;
355338
const arrayResult = findArrayPath(req.responseBody);
356339
if (isWriteCandidate(req)) {
357-
const score = scoreWriteRequest(req);
358-
if (score > 0) candidates.push({ kind: 'write', req, score, arrayResult: null });
340+
candidates.push({ kind: 'write', req, arrayResult: null });
359341
continue;
360342
}
361343
if (arrayResult) {
362-
const score = scoreRequest(req, arrayResult);
363-
if (score > 0) candidates.push({ kind: 'read', req, score, arrayResult });
344+
candidates.push({ kind: 'read', req, arrayResult });
364345
}
365346
}
366347
return { candidates };
@@ -532,9 +513,9 @@ export function generateRecordedCandidates(
532513
deduped.set(key, current ? preferRecordedCandidate(current, candidate) : candidate);
533514
}
534515

516+
// Sort candidates by array item count (richer data first; write candidates have no array and count as 0), then take top 5
535517
const selected = [...deduped.values()]
536-
.filter((candidate) => candidate.kind === 'read' ? candidate.score >= 8 : candidate.score >= 6)
537-
.sort((a, b) => b.score - a.score)
518+
.sort((a, b) => (b.arrayResult?.items.length ?? 0) - (a.arrayResult?.items.length ?? 0))
538519
.slice(0, 5);
539520

540521
const usedNames = new Set<string>();
@@ -741,14 +722,14 @@ function analyzeAndWrite(
741722
const candidates: RecordResult['candidates'] = [];
742723
const usedNames = new Set<string>();
743724

744-
console.log(chalk.bold('\n Captured endpoints (scored):\n'));
725+
console.log(chalk.bold('\n Captured endpoints:\n'));
745726

746-
for (const entry of analysis.candidates.sort((a, b) => b.score - a.score).slice(0, 8)) {
727+
for (const entry of analysis.candidates.sort((a, b) => (b.arrayResult?.items.length ?? 0) - (a.arrayResult?.items.length ?? 0)).slice(0, 8)) {
747728
const itemCount = entry.arrayResult?.items.length ?? 0;
748729
const strategy = entry.kind === 'write'
749730
? 'cookie'
750731
: inferStrategy(detectAuthFromContent(entry.req.url, entry.req.responseBody));
751-
const marker = entry.score >= 15 ? chalk.green('★') : entry.score >= 8 ? chalk.yellow('◆') : chalk.dim('·');
732+
const marker = entry.kind === 'write' ? chalk.magenta('✎') : itemCount > 5 ? chalk.green('★') : chalk.dim('·');
752733
console.log(
753734
` ${marker} ${chalk.white(urlToPattern(entry.req.url))}` +
754735
chalk.dim(` [${strategy}]`) +
@@ -777,7 +758,7 @@ function analyzeAndWrite(
777758
}
778759

779760
if (candidates.length === 0) {
780-
console.log(chalk.yellow(' No high-confidence candidates found.'));
761+
console.log(chalk.yellow(' No candidates found.'));
781762
console.log(chalk.dim(' Tip: make sure you triggered JSON API calls (open lists, search, scroll).'));
782763
}
783764

src/synthesize.ts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ export interface SynthesizeCapability {
2626
name: string;
2727
description: string;
2828
strategy: string;
29-
confidence?: number;
3029
endpoint?: string;
3130
itemPath?: string | null;
3231
recommendedColumns?: string[];
@@ -67,7 +66,6 @@ export interface SynthesizeCandidateSummary {
6766
name: string;
6867
path: string;
6968
strategy: string;
70-
confidence?: number;
7169
}
7270

7371
export interface SynthesizeResult {
@@ -98,7 +96,6 @@ export function synthesizeFromExplore(
9896

9997
const site = bundle.manifest.site;
10098
const capabilities = (bundle.capabilities ?? [])
101-
.sort((a, b) => (b.confidence ?? 0) - (a.confidence ?? 0))
10299
.slice(0, opts.top ?? 3);
103100
const candidates: SynthesizeCandidateSummary[] = [];
104101

@@ -108,7 +105,7 @@ export function synthesizeFromExplore(
108105
const candidate = buildCandidateYaml(site, bundle.manifest, cap, endpoint);
109106
const filePath = path.join(targetDir, `${candidate.name}.yaml`);
110107
fs.writeFileSync(filePath, yaml.dump(candidate.yaml, { sortKeys: false, lineWidth: 120 }));
111-
candidates.push({ name: candidate.name, path: filePath, strategy: cap.strategy, confidence: cap.confidence });
108+
candidates.push({ name: candidate.name, path: filePath, strategy: cap.strategy });
112109
}
113110

114111
const index = { site, target_url: bundle.manifest.target_url, generated_from: exploreDir, candidate_count: candidates.length, candidates };
@@ -119,7 +116,7 @@ export function synthesizeFromExplore(
119116

120117
export function renderSynthesizeSummary(result: SynthesizeResult): string {
121118
const lines = ['opencli synthesize: OK', `Site: ${result.site}`, `Source: ${result.explore_dir}`, `Candidates: ${result.candidate_count}`];
122-
for (const c of result.candidates ?? []) lines.push(` • ${c.name} (${c.strategy}, ${((c.confidence ?? 0) * 100).toFixed(0)}% confidence) → ${c.path}`);
119+
for (const c of result.candidates ?? []) lines.push(` • ${c.name} (${c.strategy}) → ${c.path}`);
123120
return lines.join('\n');
124121
}
125122

@@ -147,7 +144,12 @@ function chooseEndpoint(cap: SynthesizeCapability, endpoints: ExploreEndpointArt
147144
const match = endpoints.find((endpoint) => endpoint.pattern === endpointPattern || endpoint.url?.includes(endpointPattern));
148145
if (match) return match;
149146
}
150-
return [...endpoints].sort((a, b) => (b.score ?? 0) - (a.score ?? 0))[0];
147+
// Fallback: prefer endpoint with most data (item count + detected fields)
148+
return [...endpoints].sort((a, b) => {
149+
const aKey = (a.itemCount ?? 0) * 10 + Object.keys(a.detectedFields ?? {}).length;
150+
const bKey = (b.itemCount ?? 0) * 10 + Object.keys(b.detectedFields ?? {}).length;
151+
return bKey - aKey;
152+
})[0];
151153
}
152154

153155
// ── URL templating ─────────────────────────────────────────────────────────

0 commit comments

Comments
 (0)