Skip to content

Commit 6fb3eee

Browse files
committed
feat: bundle 扫描提取包体内嵌 OAuth URL + budget 调优 + 并发与限流
bundle-license.ts 拓展为两路输出:除了原有的 license 注释扫描,新增从 JS body 用正则提取 `https?://.../(oauth|oauth2|authorize|connect|sso|openid-connect|saml|signin|sign-in|login/oauth)` 形态的 URL,跑一遍第三方登录规则匹配。SPA 把 OAuth 回调写在 bundle 里、HTML 里看不到的场景(如 linux.do / GitHub / Google 登录在 new-api 这类站)都能识别,evidence 标为「JS 包体内嵌 OAuth 入口」,source 仍归到 JS 版权注释。 配套调整:bundle 调度延时 SCAN_DELAY_MS 1400→600ms;sample budget MAX_TOTAL_SAMPLE_BYTES 2MB→4MB,确保 5 个候选脚本都能拿到 head 384KB;每脚本 range 采样从 6 段降到 3 段,采样比例减为 [0.25, 0.5, 1];总采样上限 MAX_SCAN_MS 8s→12s,留充足时间。脚本扫描从 for-await 串行改成 3 个一组的 Promise.all 并发批次,5 个脚本从 ~5s 压到 ~1s。fetch 加 `priority: 'low'` 让 Chrome 把扫描请求排到页面资源之后,不抢用户带宽。ScriptLicenseObservation 增加 sourceLength / embeddedAuthUrls 诊断字段。schema 版本随逻辑改动从 4 滚到 7。 将版本号提升到 1.3.58。
1 parent 6593dd5 commit 6fb3eee

2 files changed

Lines changed: 102 additions & 21 deletions

File tree

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "stackprism",
33
"private": true,
4-
"version": "1.3.57",
4+
"version": "1.3.58",
55
"type": "module",
66
"description": "StackPrism 用于检测网页前端、后端、CDN、SaaS、广告营销、统计、登录、支付、网站程序和主题模板线索。",
77
"scripts": {

src/background/bundle-license.ts

Lines changed: 101 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,27 @@ import { buildEffectivePageRules, loadDetectorSettings, loadTechRules } from './
33
import { mergeTechnologyRecords, shortHeaderUrl } from './merge'
44
import { getTabData, getTabSnapshot, updateBadgeForTab, writeTabData } from './tab-store'
55
import { matchesCompiledRulePatterns, matchesRuleTextHints } from './rule-matcher'
6+
import { withTabWriteLock } from './tab-write-lock'
67
import { isDetectablePageUrl } from '@/utils/page-support'
78
import { cleanTechnologyUrl } from '@/utils/url'
89

9-
const BUNDLE_LICENSE_SCHEMA_VERSION = 4
10+
const BUNDLE_LICENSE_SCHEMA_VERSION = 7
1011
const BUNDLE_LICENSE_SOURCE = 'JS 版权注释'
1112
const MAX_CANDIDATE_SCRIPTS = 5
1213
const MAX_FETCH_BYTES = 384 * 1024
1314
const MAX_RANGE_SAMPLE_BYTES = 160 * 1024
14-
const MAX_TOTAL_SAMPLE_BYTES = 2 * 1024 * 1024
15+
// 5 个候选 × 384KB head = 1920KB(约 1.88MB),再加 index 类大文件的 range 采样,budget 设到 4MB 才够
16+
const MAX_TOTAL_SAMPLE_BYTES = 4 * 1024 * 1024
1517
const MIN_SAMPLE_BYTES = 24 * 1024
16-
const MAX_RANGE_SAMPLES_PER_SCRIPT = 6
17-
const MAX_RANGE_SAMPLES_PER_SCAN = 10
18+
// 减到 3:对 OAuth URL / 内嵌 license 来说,首段 384KB 加 3 个 range 采样(位置见 RANGE_SAMPLE_RATIOS)足够,留预算给其他候选
19+
const MAX_RANGE_SAMPLES_PER_SCRIPT = 3
20+
const MAX_RANGE_SAMPLES_PER_SCAN = 8
1821
const MAX_SIDECAR_BYTES = 160 * 1024
1922
const MAX_LICENSE_TEXT_CHARS = 180_000
2023
const FETCH_TIMEOUT_MS = 6000
21-
const MAX_SCAN_MS = 8000
22-
const SCAN_DELAY_MS = 1400
23-
const RANGE_SAMPLE_RATIOS = [0.25, 0.5, 0.8, 0.835, 0.9, 1] as const
24+
const MAX_SCAN_MS = 12000
25+
const SCAN_DELAY_MS = 600
26+
const RANGE_SAMPLE_RATIOS = [0.25, 0.5, 1] as const
2427

2528
const bundleLicenseTimers = new Map<number, ReturnType<typeof setTimeout>>()
2629

@@ -34,6 +37,11 @@ type ScriptLicenseObservation = {
3437
commentCount: number
3538
text: string
3639
sidecarUrl?: string
40+
// JS 包体里嵌入的 OAuth / SSO / 登录端点 URL:SPA 把这种 URL 写在 bundle 里,
41+
// HTML / 资源列表 / 响应头里看不到,需要单独走第三方登录规则匹配
42+
embeddedAuthUrls?: string[]
43+
// 实际拉到的字节数:用来在 popup 原始线索里诊断脚本是否真的被 fetch 到了
44+
sourceLength?: number
3745
}
3846

3947
type RangeFetchResult = {
@@ -227,8 +235,10 @@ const fetchTextRange = async (url: string, start: number, maxBytes: number, budg
227235
cache: 'force-cache',
228236
credentials: 'omit',
229237
headers: { Range: `bytes=${safeStart}-${end}` },
230-
signal: controller.signal
231-
})
238+
signal: controller.signal,
239+
// 让 Chrome 把 bundle 扫描的请求排到页面资源后面,不抢用户的关键带宽
240+
priority: 'low'
241+
} as RequestInit)
232242
if (!response.ok) return { rangeSupported: false, text: '' }
233243
if (!isTextLikeResponse(url, response)) return { rangeSupported: false, text: '' }
234244

@@ -290,6 +300,23 @@ const trimLicenseText = (text: string): string => {
290300

291301
const looksLikeHtmlDocument = (text: string): boolean => /^\s*(?:<!doctype\s+html|<html[\s>])/i.test(text)
292302

303+
// Matches absolute OAuth / SSO style URLs embedded in a JS bundle body, as typically
// produced by a "sign in with X" button in an SPA. The URLs found are later run through
// the third-party login rules, covering a blind spot that HTML, the resource list and
// response headers cannot reach.
//
// Shape: scheme + host + "/" + optional path prefix + an auth keyword that starts a path
// segment + up to 200 trailing URL characters.
// The host part accepts registrable names of any length (e.g. `x.com`); requiring three
// or more characters before the TLD dot would silently drop `https://x.com/i/oauth2/authorize`.
const EMBEDDED_AUTH_URL_PATTERN =
  /https?:\/\/[a-z0-9][a-z0-9.-]*\.[a-z]{2,}\/(?:[^\s'"`<>]*?\/)?(?:oauth2?|authorize|connect|sso|openid[-_\/]?connect|saml|signin|sign-in|login\/oauth)\b[^\s'"`<>)\]}]{0,200}/gi

// Extracts the distinct OAuth / SSO looking URLs from a bundle's source text.
//
// @param source  Sampled JS source text; an empty string yields an empty list.
// @returns       De-duplicated URLs in first-seen order, at most 60 entries, each
//                between 14 and 240 characters after trailing punctuation is removed.
const extractEmbeddedAuthUrls = (source: string): string[] => {
  if (!source) return []

  const minUrlLength = 14
  const maxUrlLength = 240
  const maxUrls = 60

  const seen = new Set<string>()
  // matchAll works on a clone of the regex, so the shared /g pattern's lastIndex is never mutated.
  for (const match of source.matchAll(EMBEDDED_AUTH_URL_PATTERN)) {
    // Drop punctuation that belongs to the surrounding code or prose, not to the URL.
    const url = match[0].replace(/[)\];,.}'`"]+$/, '').trim()
    if (url.length < minUrlLength || url.length > maxUrlLength) continue
    seen.add(url)
    // Hard cap so a bundle full of URL-like strings cannot bloat the stored observation.
    if (seen.size >= maxUrls) break
  }
  return [...seen]
}
319+
293320
const extractLicenseComments = (source: string): string[] => {
294321
const comments: string[] = []
295322
let commentChars = 0
@@ -336,17 +363,22 @@ const fetchSidecarLicenseText = async (sidecarUrl: string, budget: ScanBudget):
336363

337364
const scanScriptLicense = async (scriptUrl: string, budget: ScanBudget): Promise<ScriptLicenseObservation | null> => {
338365
const source = await fetchSampledScriptText(scriptUrl, budget)
366+
const sourceLength = source?.length || 0
339367
const comments = unique(source ? extractLicenseComments(source) : [])
368+
const embeddedAuthUrls = source ? extractEmbeddedAuthUrls(source) : []
340369
const sidecarUrl = buildSidecarLicenseUrl(scriptUrl)
341370
const sidecarText = sidecarUrl && comments.length < 12 && hasScanBudget(budget) ? await fetchSidecarLicenseText(sidecarUrl, budget) : ''
342371
const text = trimLicenseText([...comments, sidecarText].filter(Boolean).join('\n\n'))
343372

344-
if (!text) return null
373+
// 拉到了字节但没有 license 文本 / OAuth URL 时也保留 observation,方便用户在原始线索里确认脚本确实被 fetch 到;只有既没拿到字节、也没有任何文本和 URL 时才返回 null
374+
if (!sourceLength && !text && !embeddedAuthUrls.length) return null
345375
return {
346376
url: scriptUrl,
347377
commentCount: comments.length + (sidecarText ? 1 : 0),
348378
text,
349-
sidecarUrl: sidecarText ? sidecarUrl : undefined
379+
sidecarUrl: sidecarText ? sidecarUrl : undefined,
380+
embeddedAuthUrls: embeddedAuthUrls.length ? embeddedAuthUrls : undefined,
381+
sourceLength: sourceLength || undefined
350382
}
351383
}
352384

@@ -373,12 +405,48 @@ const detectTechnologiesFromLicenseText = (observations: ScriptLicenseObservatio
373405
return mergeTechnologyRecords(technologies).slice(0, 80)
374406
}
375407

408+
// Runs the third-party login rules over the OAuth URLs extracted from bundles:
409+
// when an SPA keeps its "sign in with linux.do / GitHub / Google" callback URL inside the bundle,
410+
// the HTML, resource URLs and response headers never show it, so the only option is to scan
// the bundle body for URLs and then match those against the rules.
//
// observations: per-script scan results; only entries with a non-empty embeddedAuthUrls are used.
// rules: login rules to test; entries without a name are skipped.
// Returns the records after mergeTechnologyRecords, truncated to the first 40.
411+
const detectAuthProvidersFromBundles = (observations: ScriptLicenseObservation[], rules: any[]): any[] => {
412+
// Nothing to do without rules or observations (also guards against a non-array rules value).
if (!Array.isArray(rules) || !rules.length || !observations.length) return []
413+
414+
const technologies: any[] = []
415+
for (const observation of observations) {
416+
if (!observation.embeddedAuthUrls?.length) continue
417+
// All URLs of this script joined by newlines, so each rule is first tested once against the whole set.
const urlBlob = observation.embeddedAuthUrls.join('\n')
418+
for (const rule of rules) {
419+
if (!rule?.name) continue
420+
if (!matchesCompiledRulePatterns(rule, urlBlob)) continue
421+
// Pick the individual URL that matches the rule for the evidence text; falls back to the
// first URL when none matches on its own.
// NOTE(review): presumably that only happens when the blob match spans several URLs — confirm.
const matched = observation.embeddedAuthUrls.find(u => matchesCompiledRulePatterns(rule, u)) || observation.embeddedAuthUrls[0]
422+
const kindPrefix = rule.kind ? `${rule.kind}:` : ''
423+
technologies.push({
424+
// Category and confidence come from the rule, with the literals below as defaults.
category: rule.category || '第三方登录 / OAuth',
425+
name: rule.name,
426+
confidence: rule.confidence || '高',
427+
evidence: [`${kindPrefix}JS 包体内嵌 OAuth 入口 ${shortHeaderUrl(matched)}`],
428+
// Uses the same source label constant as the license-comment findings in this file.
source: BUNDLE_LICENSE_SOURCE,
429+
url: cleanTechnologyUrl(rule.url)
430+
})
431+
}
432+
}
433+
434+
return mergeTechnologyRecords(technologies).slice(0, 40)
435+
}
436+
376437
const saveBundleLicenseDataAndBadge = async (tabId: number, data: any, settings: any, tab: any) => {
377438
if (!isDetectablePageUrl(tab?.url)) return
378-
const popup = await buildPopupCacheRecord(data, settings, tab)
379-
const { popup: _legacyPopup, ...tabData } = data || {}
380-
await writeTabData(tabId, tabData, popup)
381-
await updateBadgeForTab(tabId, popup)
439+
// 走 per-tab 写锁:bundle 扫描跑 1-2s,期间 detection / dynamic / headers 都可能在并发写;
440+
// 进入锁后再 re-read 最新 storage,只覆盖自己的 bundle 字段,其他字段保留最新
441+
await withTabWriteLock(tabId, async () => {
442+
const latest = (await getTabData(tabId)) || {}
443+
latest.bundle = data.bundle
444+
latest.updatedAt = data.updatedAt || Date.now()
445+
const popup = await buildPopupCacheRecord(latest, settings, tab)
446+
const { popup: _legacyPopup, ...tabData } = latest
447+
await writeTabData(tabId, tabData, popup)
448+
await updateBadgeForTab(tabId, popup)
449+
})
382450
}
383451

384452
export const runBundleLicenseDetection = async (tabId: number): Promise<void> => {
@@ -395,15 +463,26 @@ export const runBundleLicenseDetection = async (tabId: number): Promise<void> =>
395463
if (data.bundle?.schemaVersion === BUNDLE_LICENSE_SCHEMA_VERSION && data.bundle?.signature === signature) return
396464

397465
const budget = createScanBudget()
466+
// 并发扫候选脚本,但限制同时 fetch 数为 3:完全并行会一次性占 5 条网络连接,
467+
// 跟页面自身资源抢带宽;3 个一组既能压住扫描总耗时,又不挤占用户的页面加载。
468+
const SCRIPT_SCAN_CONCURRENCY = 3
398469
const observations: ScriptLicenseObservation[] = []
399-
for (const script of scripts) {
470+
for (let i = 0; i < scripts.length; i += SCRIPT_SCAN_CONCURRENCY) {
400471
if (!hasScanBudget(budget)) break
401-
const observation = await scanScriptLicense(script, budget)
402-
if (observation) observations.push(observation)
472+
const batch = scripts.slice(i, i + SCRIPT_SCAN_CONCURRENCY)
473+
const batchResults = await Promise.all(
474+
batch.map(script => (hasScanBudget(budget) ? scanScriptLicense(script, budget) : Promise.resolve(null)))
475+
)
476+
for (const observation of batchResults) {
477+
if (observation) observations.push(observation)
478+
}
403479
await yieldToEventLoop()
404480
}
405481

406-
const technologies = detectTechnologiesFromLicenseText(observations, pageRules.bundleLicenseLibraries || [])
482+
const technologies = mergeTechnologyRecords([
483+
...detectTechnologiesFromLicenseText(observations, pageRules.bundleLicenseLibraries || []),
484+
...detectAuthProvidersFromBundles(observations, pageRules.thirdPartyLogins || [])
485+
])
407486
const pageIdentity = getBundlePageIdentity(data, tab)
408487

409488
data.bundle = {
@@ -415,7 +494,9 @@ export const runBundleLicenseDetection = async (tabId: number): Promise<void> =>
415494
scripts: observations.map(observation => ({
416495
url: observation.url,
417496
sidecarUrl: observation.sidecarUrl || '',
418-
commentCount: observation.commentCount
497+
commentCount: observation.commentCount,
498+
sourceLength: observation.sourceLength || 0,
499+
embeddedAuthUrls: observation.embeddedAuthUrls || []
419500
})),
420501
technologies
421502
}

0 commit comments

Comments
 (0)