|
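#!/usr/bin/env node
// For every tech in tech-links.json that has an official website but no icon shipped by
// Wappalyzer, fetch the website's favicon and save it to
// build-scripts/custom-icons/<slug>.{svg,png,ico}; extract-wappalyzer-icons.mjs later
// copies those files into public/skills/.
//
// Workflow:
//   1. Read tech-links.json to get the tech name → official website URL mapping.
//   2. Skip slugs that already have an icon in skills-index.
//   3. Skip blocklisted domains (github / npm / wordpress.org / drupal.org and similar
//      places that are not a brand's own website).
//   4. Scan the HTML head for the best favicon: SVG → apple-touch-icon → 32+ px PNG → /favicon.ico.
//   5. Download and save it.
//
// Usage:
//   node build-scripts/fetch-tech-favicons.mjs                # full run
//   node build-scripts/fetch-tech-favicons.mjs --limit=50     # only the first 50
//   node build-scripts/fetch-tech-favicons.mjs --force        # also refetch techs that already have an icon
//   node build-scripts/fetch-tech-favicons.mjs --only=<name>  # only the tech with exactly this name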
| 15 | + |
| 16 | +import fs from 'node:fs' |
| 17 | +import path from 'node:path' |
| 18 | +import { fileURLToPath } from 'node:url' |
| 19 | + |
// --- Filesystem layout -------------------------------------------------------
// All paths are derived from this script's own location, so the tool behaves
// the same regardless of the current working directory.
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const repoRoot = path.resolve(__dirname, '..')
// Input: tech name → website URL mapping.
const TECH_LINKS_PATH = path.join(repoRoot, 'public', 'tech-links.json')
// Input: index of slugs that already have an icon.
const MANIFEST_PATH = path.join(repoRoot, 'src', 'ui', 'components', 'skills-index.json')
// Output: downloaded icons are written next to this script.
const OUTPUT_DIR = path.join(__dirname, 'custom-icons')

// --- Network tuning ----------------------------------------------------------
// Number of techs processed in parallel per batch.
const CONCURRENCY = 8
// Per-request timeout in milliseconds.
const TIMEOUT_MS = 6_000
// Sent as the User-Agent header on every request (a desktop Chrome UA string).
const USER_AGENT = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
  'AppleWebKit/537.36 (KHTML, like Gecko)',
  'Chrome/130.0.0.0',
  'Safari/537.36'
].join(' ')
| 29 | + |
/**
 * Read the value of a `--<flag>=<value>` command-line option.
 *
 * Everything after the FIRST "=" is returned, so values that themselves
 * contain "=" (e.g. `--only=a=b`) are not truncated.
 *
 * @param {string[]} argv - Command-line arguments (without node / script path).
 * @param {string} flag - Option name without the leading dashes.
 * @returns {string} The option value, or '' when the option is absent.
 */
const flagValue = (argv, flag) => {
  const prefix = `--${flag}=`
  const hit = argv.find(a => a.startsWith(prefix))
  return hit ? hit.slice(prefix.length) : ''
}

const args = process.argv.slice(2)
// --limit=N: process only the first N queued techs. Anything that is not a
// positive number (missing, non-numeric, zero, negative) means "no limit" (0).
const requestedLimit = Number(flagValue(args, 'limit'))
const limit = Number.isFinite(requestedLimit) && requestedLimit > 0 ? Math.floor(requestedLimit) : 0
// --force: do not skip techs that already have an icon.
const force = args.includes('--force')
// --only=<name>: restrict the run to the tech with exactly this name.
const onlyName = flagValue(args, 'only')
| 34 | + |
// Hosts that aggregate many projects rather than being a brand's own website.
// Even when tech-links points at one of them, its favicon must not be used as
// the brand logo.
const DOMAIN_BLOCKLIST = new Set([
  // Source-code hosting
  'github.com', 'gitlab.com', 'bitbucket.org', 'codeberg.org', 'sourceforge.net',
  // Package registries, plugin directories and CDNs
  'npmjs.com', 'www.npmjs.com', 'wordpress.org', 'drupal.org', 'packagist.org',
  'cdnjs.com', 'unpkg.com', 'jsdelivr.com', 'cdn.jsdelivr.net', 'yarnpkg.com',
  'pypi.org', 'mvnrepository.com', 'nuget.org', 'rubygems.org', 'crates.io',
  'hex.pm', 'pkg.go.dev', 'pub.dev',
  // Specifications and generic developer documentation
  'developer.mozilla.org', 'w3.org', 'www.w3.org', 'spec.whatwg.org', 'tc39.es',
  'caniuse.com', 'web.dev', 'developers.google.com', 'docs.microsoft.com',
  'learn.microsoft.com', 'docs.aws.amazon.com'
])
| 72 | + |
// Rule names that are protocols / specifications rather than brand names;
// fetching an "official website" favicon for them would be meaningless.
const NAME_SKIP = new Set([
  // Transport protocols
  'HTTPS', 'HTTP/2', 'HTTP/3', 'WebSocket',
  // Language and runtime features
  'JavaScript', 'WebAssembly', 'ES Modules',
  // PWA building blocks
  'PWA Manifest', 'Service Worker'
])
| 85 | + |
/**
 * Turn an arbitrary name into a slug.
 *
 * The input is lowercased, ".", "+" and "&" are spelled out as "dot", "plus"
 * and "and", and every remaining character outside a-z, 0-9 and the
 * U+4E00–U+9FA5 range is removed.
 *
 * @param {*} raw - Name to normalize; falsy values are treated as ''.
 * @returns {string} The slug (possibly empty).
 */
const normalize = raw => {
  // Applied in order; the final rule strips whatever is still not allowed.
  const rewrites = [
    [/\./g, 'dot'],
    [/\+/g, 'plus'],
    [/&/g, 'and'],
    [/[^a-z0-9一-龥]/g, '']
  ]
  const lowered = String(raw || '').toLowerCase()
  return rewrites.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), lowered)
}
| 93 | + |
/**
 * Return the first alternative of a "A / B" style name.
 *
 * The name is split on "/" (with optional surrounding whitespace) and the
 * first part is returned trimmed. When that part is empty, the whole trimmed
 * name is returned instead.
 *
 * @param {*} name - Display name; falsy values are treated as ''.
 * @returns {string}
 */
const primaryName = name => {
  const full = String(name || '')
  const [head] = full.split(/\s*\/\s*/)
  const trimmedHead = head.trim()
  return trimmedHead !== '' ? trimmedHead : full.trim()
}
| 98 | + |
/**
 * Build the icon slug for a tech name: take its primary name, then normalize it.
 *
 * @param {*} name - Tech display name.
 * @returns {string} Slug used as the icon's base filename.
 */
const slugFromName = name => {
  const head = primaryName(name)
  return normalize(head)
}
| 100 | + |
/**
 * Resolve (with undefined) after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
const sleep = ms =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
| 102 | + |
/**
 * fetch() with a hard time budget of TIMEOUT_MS and the script's default headers.
 *
 * The timeout signal stays armed after this function returns, so it also
 * aborts a stalled body read (`res.text()` / `res.arrayBuffer()`), not just
 * the wait for response headers. On expiry the pending operation rejects
 * with a DOMException named "TimeoutError".
 *
 * @param {string|URL} url - Resource to fetch.
 * @param {object} [init] - Extra fetch options. `signal` and `redirect` are
 *   always overridden; `init.headers` (a plain object) is merged over the defaults.
 * @returns {Promise<Response>} The response, whatever its HTTP status.
 */
const fetchWithTimeout = async (url, init = {}) =>
  fetch(url, {
    ...init,
    // AbortSignal.timeout() uses an unref'd timer, so there is nothing to clear
    // and it never keeps the process alive on its own.
    signal: AbortSignal.timeout(TIMEOUT_MS),
    redirect: 'follow',
    headers: { 'User-Agent': USER_AGENT, Accept: '*/*', ...(init.headers || {}) }
  })
| 119 | + |
/**
 * Parse a <link sizes="..."> attribute into a single comparable number.
 *
 * The attribute is a space-separated list of WxH tokens (e.g.
 * "16x16 32x32 180x180"). Each token is reduced to its shorter edge and the
 * largest of those is returned, so a multi-size icon is ranked by its best
 * variant rather than by whichever token happens to come first.
 *
 * @param {*} sizes - Raw attribute value.
 * @returns {number} Largest short edge in pixels; 0 when no WxH token is
 *   present (missing attribute, "any", ...).
 */
const parseSizes = sizes => {
  if (!sizes) return 0
  let best = 0
  for (const token of String(sizes).matchAll(/(\d+)x(\d+)/gi)) {
    best = Math.max(best, Math.min(Number(token[1]), Number(token[2])))
  }
  return best
}
| 125 | + |
/**
 * Collect favicon candidates from the <link> tags of an HTML document.
 *
 * A tag qualifies when it has an href and its rel contains "icon" (which also
 * covers apple-touch-icon and mask-icon). Candidates are scored
 * SVG (+1000) > apple-touch-icon (+300) > declared size (+parseSizes) and
 * returned best-first.
 *
 * Attribute values may be double-quoted, single-quoted or unquoted; "&amp;"
 * in the href is decoded before the URL is resolved.
 *
 * @param {string} html - Document markup ('' yields no candidates).
 * @param {string} baseUrl - URL that relative hrefs are resolved against.
 * @returns {{url: string, score: number, type: string}[]} Sorted by descending score.
 */
const extractFaviconCandidates = (html, baseUrl) => {
  // Read one attribute of a tag. The lookbehind stops `href` from matching
  // inside e.g. `data-href`; the three alternatives are "…", '…' and bare values.
  const readAttr = (tag, attr) => {
    const pattern = new RegExp(`(?<![\\w-])${attr}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s"'>]+))`, 'i')
    const hit = pattern.exec(tag)
    return hit ? (hit[1] ?? hit[2] ?? hit[3] ?? '') : ''
  }

  const candidates = []
  for (const [tag] of String(html || '').matchAll(/<link\b[^>]+>/gi)) {
    const href = readAttr(tag, 'href').replace(/&amp;/gi, '&')
    if (!href) continue
    const lowerRel = readAttr(tag, 'rel').toLowerCase()
    if (!/(icon|apple-touch-icon|mask-icon)/.test(lowerRel)) continue

    let abs
    try {
      abs = new URL(href, baseUrl).toString()
    } catch {
      // Unresolvable href: not a usable candidate.
      continue
    }

    const type = readAttr(tag, 'type')
    const isSvg = /\.svg(?:$|[?#])/i.test(abs) || /svg/i.test(type)
    const isApple = /apple-touch-icon/.test(lowerRel)
    // Score: SVG > apple-touch > larger declared size > plain icon.
    let score = parseSizes(readAttr(tag, 'sizes'))
    if (isSvg) score += 1000
    if (isApple) score += 300
    candidates.push({ url: abs, score, type })
  }
  return candidates.sort((a, b) => b.score - a.score)
}
| 158 | + |
/**
 * Decide which file extension a downloaded icon should be saved with.
 *
 * A response that declares itself an HTML / XHTML / JSON document is rejected
 * outright, whatever its URL looks like: sites that answer unknown paths with
 * a 200 "not found" page or an SPA shell would otherwise be saved as
 * `/favicon.svg`, `/apple-touch-icon.png`, etc.
 *
 * Otherwise the URL's extension or the Content-Type picks the result, checked
 * in the order svg → png → ico → jpeg.
 *
 * @param {string} url - URL the icon was downloaded from.
 * @param {string} contentType - Response Content-Type header ('' if absent).
 * @returns {'svg'|'png'|'ico'|''} Extension without dot; '' when unrecognized.
 */
const extFromUrlAndType = (url, contentType) => {
  const declared = String(contentType || '')
  const mime = declared.split(';')[0].trim().toLowerCase()
  if (mime === 'text/html' || mime === 'application/xhtml+xml' || mime === 'application/json') return ''

  if (/\.svg(?:$|[?#])/i.test(url) || /svg/i.test(declared)) return 'svg'
  if (/\.png(?:$|[?#])/i.test(url) || /png/i.test(declared)) return 'png'
  if (/\.ico(?:$|[?#])/i.test(url) || /x-icon|icon/i.test(declared)) return 'ico'
  // JPEG is deliberately stored under a .png suffix (browsers sniff the real format for <img>).
  if (/\.jpe?g(?:$|[?#])/i.test(url) || /jpe?g/i.test(declared)) return 'png'
  return ''
}
| 166 | + |
/**
 * Heuristic: does this payload start like an HTML document?
 * Used to reject "200 OK" not-found pages served at icon URLs.
 * An optional UTF-8 BOM and leading whitespace are tolerated.
 */
const looksLikeHtmlDocument = buf =>
  /^(?:\u00ef\u00bb\u00bf)?\s*(?:<!doctype\s+html|<html[\s>])/i.test(buf.subarray(0, 256).toString('latin1'))

/**
 * Find and download the best available icon for a website.
 *
 * Steps:
 *   1. Fetch the site's origin page and collect <link rel="icon"> candidates.
 *      Relative hrefs are resolved against the final (post-redirect) URL.
 *      A failure here is not fatal: the well-known locations are still tried.
 *   2. Append the well-known locations /apple-touch-icon.png, /favicon.svg and
 *      /favicon.ico, then drop duplicate URLs.
 *   3. Download candidates in order and return the first one that is
 *      64 B – 512 KiB, is not an HTML page, and has a recognized extension.
 *
 * @param {string} techName - Not used by the lookup; kept so callers' call
 *   shape stays unchanged.
 * @param {string} websiteUrl - Any URL on the tech's website (http/https only).
 * @returns {Promise<
 *   {ok: true, ext: string, buf: Buffer, sourceUrl: string, score: number} |
 *   {ok: false, reason: 'invalid-url'} |
 *   {ok: false, reason: 'no-icon-found', tried: string[]}
 * >} Never rejects for network errors; those are recorded in `tried`.
 */
const fetchIcon = async (techName, websiteUrl) => {
  let origin
  try {
    const parsed = new URL(websiteUrl)
    // mailto:, file:, data: ... have the opaque origin "null", which cannot be
    // used as a base URL for the well-known paths below.
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      return { ok: false, reason: 'invalid-url' }
    }
    origin = parsed.origin
  } catch {
    return { ok: false, reason: 'invalid-url' }
  }

  // 1. Origin page → <link> candidates.
  let html = ''
  let baseUrl = origin
  try {
    const res = await fetchWithTimeout(origin, {})
    if (res.ok) {
      html = await res.text()
      // After a redirect (e.g. example.com → www.example.com/en/) relative
      // hrefs belong to the final URL, not to the URL we asked for.
      if (res.url) baseUrl = res.url
    }
  } catch {
    // Best effort: fall through to the well-known locations.
  }

  // 2. Well-known fallbacks, then de-duplicate while keeping the order.
  const wellKnown = [
    { url: new URL('/apple-touch-icon.png', origin).toString(), score: 200, type: '' },
    { url: new URL('/favicon.svg', origin).toString(), score: 100, type: '' },
    { url: new URL('/favicon.ico', origin).toString(), score: 0, type: '' }
  ]
  const seenUrls = new Set()
  const candidates = [...extractFaviconCandidates(html, baseUrl), ...wellKnown].filter(cand => {
    if (seenUrls.has(cand.url)) return false
    seenUrls.add(cand.url)
    return true
  })

  // 3. First acceptable download wins; every rejection is recorded for the report.
  const tried = []
  for (const cand of candidates) {
    try {
      const res = await fetchWithTimeout(cand.url, {})
      if (!res.ok) {
        tried.push(`${cand.url}=${res.status}`)
        continue
      }
      const buf = Buffer.from(await res.arrayBuffer())
      const contentType = res.headers.get('content-type') || ''
      if (buf.length < 64) {
        tried.push(`${cand.url}=tooSmall(${buf.length})`)
        continue
      }
      if (buf.length > 512 * 1024) {
        tried.push(`${cand.url}=tooLarge(${buf.length})`)
        continue
      }
      // Sites that answer unknown paths with a 200 HTML page would otherwise
      // have that page saved as the icon.
      if (/^\s*text\/html\b/i.test(contentType) || looksLikeHtmlDocument(buf)) {
        tried.push(`${cand.url}=notImage(ct=${contentType})`)
        continue
      }
      const ext = extFromUrlAndType(cand.url, contentType)
      if (!ext) {
        tried.push(`${cand.url}=unknownExt(ct=${contentType})`)
        continue
      }
      return { ok: true, ext, buf, sourceUrl: cand.url, score: cand.score }
    } catch (e) {
      tried.push(`${cand.url}=err:${e?.name || 'unknown'}`)
    }
  }
  return { ok: false, reason: 'no-icon-found', tried }
}
| 219 | + |
// ---------------- Main flow ----------------

// Inputs: tech name → website URL, and the index of slugs that already have an icon.
const techLinks = JSON.parse(fs.readFileSync(TECH_LINKS_PATH, 'utf8')).links || {}
const skillsIndex = JSON.parse(fs.readFileSync(MANIFEST_PATH, 'utf8')).skillsIndex || {}

// Slugs that already have a file in the output directory (from earlier runs).
fs.mkdirSync(OUTPUT_DIR, { recursive: true })
const existingCustom = new Set(fs.readdirSync(OUTPUT_DIR).map(f => normalize(f.replace(/\.(svg|png|ico)$/i, ''))))

// Build the work queue: one entry per slug that still needs an icon.
const queue = []
const queuedSlugs = new Set()
for (const [name, url] of Object.entries(techLinks)) {
  if (!url || typeof url !== 'string') continue
  if (NAME_SKIP.has(name)) continue
  if (onlyName && name !== onlyName) continue
  const slug = slugFromName(name)
  if (!slug) continue
  if (!force && skillsIndex[slug]) continue // already covered by the existing icon index
  if (!force && existingCustom.has(slug)) continue // already fetched by an earlier run
  // Two names can share a slug (e.g. "Foo" and "Foo / Bar"); queueing both
  // would make them race to write the same output file.
  if (queuedSlugs.has(slug)) continue
  let hostname
  try {
    // `hostname` (unlike `host`) carries no port, so "github.com:443" still matches.
    hostname = new URL(url).hostname.toLowerCase()
  } catch {
    continue
  }
  // Treat "www.<host>" like "<host>" so the blocklist does not need both spellings.
  if (DOMAIN_BLOCKLIST.has(hostname) || DOMAIN_BLOCKLIST.has(hostname.replace(/^www\./, ''))) continue
  queuedSlugs.add(slug)
  queue.push({ name, url, slug })
}
| 246 | + |
// Report what was planned: queued candidates vs. all known techs vs. slugs already in the index.
const planSummary = `[plan] 候选 tech 数:${queue.length}(总 tech: ${Object.keys(techLinks).length},Wappalyzer 已覆盖: ${Object.keys(skillsIndex).length})`
console.log(planSummary)

// Apply --limit (0 means "no limit") and report how many will actually be fetched.
const isLimited = limit > 0
const todo = isLimited ? queue.slice(0, limit) : queue
const scopeLabel = isLimited ? `--limit=${limit}` : '全量'
console.log(`[plan] 本次抓:${todo.length} 个(${scopeLabel})`)

// Run-wide tallies, updated by the batch runner.
const fails = []
let okCount = 0
let failCount = 0
| 256 | + |
/**
 * Fetch and save the icons for one batch of queue items in parallel.
 *
 * Each item is handled independently: a lookup that fails or throws, or an
 * icon that cannot be written to disk, is recorded in `fails` / `failCount`
 * and does not abort the other items or the run. Successes increment
 * `okCount` and log one line.
 *
 * @param {{name: string, url: string, slug: string}[]} batch - Items to process.
 * @returns {Promise<undefined[]>} Resolves once every item has been handled.
 */
const runBatch = async batch =>
  Promise.all(
    batch.map(async item => {
      const recordFailure = (reason, tried) => {
        failCount++
        fails.push({ name: item.name, slug: item.slug, reason, tried })
      }

      let r
      try {
        r = await fetchIcon(item.name, item.url)
      } catch (e) {
        recordFailure(`fetch-error: ${e?.message || e?.name || 'unknown'}`)
        return
      }
      if (!r.ok) {
        recordFailure(r.reason, r.tried)
        return
      }

      const filename = `${item.slug}.${r.ext}`
      try {
        fs.writeFileSync(path.join(OUTPUT_DIR, filename), r.buf)
      } catch (e) {
        // A single unwritable file must not take the whole crawl down.
        recordFailure(`write-failed: ${e?.code || e?.message || 'unknown'}`)
        return
      }
      okCount++
      console.log(` ✓ ${item.name.padEnd(40)} ${r.ext.padEnd(4)} ${r.buf.length}B ${r.sourceUrl.slice(0, 80)}`)
    })
  )
| 274 | + |
// Entry point: process the queue batch by batch, then print a summary.
;(async () => {
  for (let i = 0; i < todo.length; i += CONCURRENCY) {
    const batch = todo.slice(i, i + CONCURRENCY)
    await runBatch(batch)
    // Throttle: pause briefly whenever the batch offset is a multiple of 40.
    if (i % 40 === 0) await sleep(200)
  }
  console.log(`\n[done] ok=${okCount} fail=${failCount}`)
  // Always show the first 30 failures; previously nothing at all was printed
  // once there were more than 30 of them.
  if (fails.length) {
    console.log('\n失败明细(前 30):')
    fails.slice(0, 30).forEach(f => {
      console.log(` · ${f.name.padEnd(40)} ${f.reason}`)
      if (f.tried) f.tried.forEach(t => console.log(` ${t}`))
    })
  }
  console.log(`\n下一步:跑 \`node build-scripts/extract-wappalyzer-icons.mjs\` 让 custom-icons/ 复制到 public/skills/ 并更新 manifest`)
})().catch(err => {
  // Surface unexpected errors and signal failure to the calling shell / CI.
  console.error(err)
  process.exitCode = 1
})
0 commit comments