Skip to content

Commit 3bfb25e

Browse files
authored
Merge pull request #1447 from heygen-com/feat/cli-capture-video
feat(cli): capture-video on-demand fetcher + capture pipeline robustness
2 parents b8fa4b5 + 6a024a3 commit 3bfb25e

10 files changed

Lines changed: 913 additions & 30 deletions

File tree

packages/cli/src/capture/assetCataloger.ts

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ export interface CatalogedAsset {
2525
sectionClasses?: string;
2626
/** Whether the image is above the fold (visible without scrolling) */
2727
aboveFold?: boolean;
28+
/** Element sits inside <header>, <nav>, or [role="banner"] — logo signal */
29+
inBanner?: boolean;
30+
/** Element sits inside <a> whose href points at the site root ("/", "./", "#", or an origin-only URL) — brand-home link */
31+
inHomeLink?: boolean;
32+
/** alt/aria-label/title contains any separator-delimited segment (2–29 chars) of document.title — brand-name signal */
33+
matchesTitleBrand?: boolean;
2834
}
2935

3036
/**
@@ -62,6 +68,26 @@ export async function catalogAssets(page: Page): Promise<CatalogedAsset[]> {
6268
var rect = el.getBoundingClientRect();
6369
ctx.aboveFold = rect.top < window.innerHeight;
6470
} catch(e) {}
71+
// Structural logo-candidate signals: class-substring alone caught 0/32 SVGs on heygen.com.
72+
ctx.inBanner = el.closest('header, nav, [role="banner"]') !== null;
73+
var homeAnchor = el.closest('a[href]');
74+
if (homeAnchor) {
75+
var aHref = homeAnchor.getAttribute('href') || '';
76+
ctx.inHomeLink = aHref === '/' || aHref === '#' || aHref === './' ||
77+
/^https?:\\/\\/[^/]+\\/?$/.test(aHref);
78+
}
79+
// Brand can be first ("HeyGen - Ideas"), last ("Ideas - HeyGen"), or colon-separated ("Vercel: Build").
80+
var titleParts = (document.title || '').split(/[-|—:]/);
81+
if (desc) {
82+
for (var ti = 0; ti < titleParts.length; ti++) {
83+
var part = titleParts[ti].trim();
84+
if (part.length > 1 && part.length < 30 &&
85+
desc.toLowerCase().indexOf(part.toLowerCase()) !== -1) {
86+
ctx.matchesTitleBrand = true;
87+
break;
88+
}
89+
}
90+
}
6591
return ctx;
6692
}
6793
@@ -92,12 +118,15 @@ export async function catalogAssets(page: Page): Promise<CatalogedAsset[]> {
92118
if (notes && !entry.notes) {
93119
entry.notes = notes;
94120
}
95-
// Merge rich context (first one wins)
121+
// Text fields and aboveFold: first occurrence wins. Logo signals (inBanner, inHomeLink, matchesTitleBrand): any positive sample wins.
96122
if (richCtx) {
97123
if (richCtx.description && !entry.description) entry.description = richCtx.description;
98124
if (richCtx.nearestHeading && !entry.nearestHeading) entry.nearestHeading = richCtx.nearestHeading;
99125
if (richCtx.sectionClasses && !entry.sectionClasses) entry.sectionClasses = richCtx.sectionClasses;
100126
if (richCtx.aboveFold !== undefined && entry.aboveFold === undefined) entry.aboveFold = richCtx.aboveFold;
127+
if (richCtx.inBanner) entry.inBanner = true;
128+
if (richCtx.inHomeLink) entry.inHomeLink = true;
129+
if (richCtx.matchesTitleBrand) entry.matchesTitleBrand = true;
101130
}
102131
}
103132
@@ -324,6 +353,9 @@ function deduplicateSrcsetVariants(assets: CatalogedAsset[]): CatalogedAsset[] {
324353
if (a.notes && !existing.notes) {
325354
existing.notes = a.notes;
326355
}
356+
if (a.inBanner) existing.inBanner = true;
357+
if (a.inHomeLink) existing.inHomeLink = true;
358+
if (a.matchesTitleBrand) existing.matchesTitleBrand = true;
327359
// Keep the URL with highest w= value (largest image)
328360
const existingW = getWidthParam(existing.url);
329361
const newW = getWidthParam(a.url);

packages/cli/src/capture/assetDownloader.ts

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,16 @@
77

88
import { writeFileSync, mkdirSync } from "node:fs";
99
import { join, extname } from "node:path";
10+
import { createHash } from "node:crypto";
1011
import type { DesignTokens, DownloadedAsset } from "./types.js";
1112
import type { CatalogedAsset } from "./assetCataloger.js";
1213

14+
// SVGs: hash-of-bytes filename so it can't drift from content; label-derived names mis-assigned brands.
15+
function svgContentHashSlug(svgSource: string | Buffer, isLogo: boolean): string {
16+
const hash = createHash("sha1").update(svgSource).digest("hex").slice(0, 8);
17+
return isLogo ? `logo-${hash}` : `svg-${hash}`;
18+
}
19+
1320
export async function downloadAssets(
1421
tokens: DesignTokens,
1522
outputDir: string,
@@ -22,15 +29,12 @@ export async function downloadAssets(
2229
const assets: DownloadedAsset[] = [];
2330
const downloadedUrls = new Set<string>();
2431

25-
// 1. ALL inline SVGs — save as files (logos get priority naming)
2632
mkdirSync(join(outputDir, "assets", "svgs"), { recursive: true });
2733
const usedSvgNames = new Set<string>();
2834
for (let i = 0; i < tokens.svgs.length && i < 30; i++) {
2935
const svg = tokens.svgs[i]!;
3036
if (!svg.outerHTML || svg.outerHTML.length < 50) continue;
31-
const label = svg.label?.replace(/[^a-zA-Z0-9-_ ]/g, "").trim();
32-
let slug = label ? slugify(label) : svg.isLogo ? `logo-${i}` : `icon-${i}`;
33-
// Deduplicate — two SVGs with same aria-label get suffixed
37+
const slug = svgContentHashSlug(svg.outerHTML, !!svg.isLogo);
3438
let finalSlug = slug;
3539
let suffix = 2;
3640
while (usedSvgNames.has(finalSlug)) {
@@ -135,8 +139,23 @@ export async function downloadAssets(
135139
if (result.status !== "fulfilled" || !result.value) continue;
136140
const { url, isPoster, parsedUrl, ext, buffer, catalog } = result.value;
137141
try {
138-
// Generate human-readable name from catalog context
139-
const slug = deriveAssetName(parsedUrl, catalog, isPoster, imgIdx, usedNames);
142+
let slug: string;
143+
if (ext === ".svg") {
144+
const c = catalog;
145+
const brandRe = /logo|brand|wordmark/i;
146+
const isLogo = !!(
147+
c?.inBanner ||
148+
c?.inHomeLink ||
149+
c?.matchesTitleBrand ||
150+
c?.contexts?.some((s) => brandRe.test(s)) ||
151+
(c?.description && brandRe.test(c.description)) ||
152+
(c?.nearestHeading && brandRe.test(c.nearestHeading)) ||
153+
(c?.sectionClasses && brandRe.test(c.sectionClasses))
154+
);
155+
slug = svgContentHashSlug(buffer, isLogo);
156+
} else {
157+
slug = deriveAssetName(parsedUrl, catalog, isPoster, imgIdx, usedNames);
158+
}
140159
const name = `${slug}${ext}`;
141160
usedNames.add(slug);
142161
const localPath = `assets/${name}`;

packages/cli/src/capture/contentExtractor.ts

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import type { Page } from "puppeteer-core";
1212
import { existsSync, readdirSync, statSync, readFileSync } from "node:fs";
1313
import { join } from "node:path";
14+
import sharp from "sharp";
1415
import type { CatalogedAsset } from "./assetCataloger.js";
1516
import type { DesignTokens } from "./types.js";
1617

@@ -232,7 +233,7 @@ export async function captionImagesWithGemini(
232233
}
233234
progress("design", `${Object.keys(geminiCaptions).length} images captioned with Gemini`);
234235

235-
// Caption SVGs by sending source code as text (vision API rejects image/svg+xml).
236+
// Rasterize SVGs to PNG before captioning — Vision hallucinates wordmarks when reading SVG path text.
236237
const svgFiles: Array<{ file: string; relPath: string }> = [];
237238
const assetsDir = join(outputDir, "assets");
238239
for (const f of readdirSync(assetsDir)) {
@@ -246,30 +247,59 @@ export async function captionImagesWithGemini(
246247
}
247248

248249
if (svgFiles.length > 0) {
249-
progress("design", `Captioning ${svgFiles.length} SVGs via code analysis...`);
250+
progress("design", `Rasterizing + captioning ${svgFiles.length} SVGs via vision API...`);
250251
const SVG_BATCH = 20;
251-
const MAX_SVG_CHARS = 10_000;
252+
const SVG_RENDER_SIZE = 256; // px — enough resolution for Gemini to read wordmarks, small enough to keep payload sub-MB
253+
let svgsSkipped = 0;
252254
for (let i = 0; i < svgFiles.length; i += SVG_BATCH) {
253255
const batch = svgFiles.slice(i, i + SVG_BATCH);
254256
const results = await Promise.allSettled(
255257
batch.map(async ({ relPath }) => {
256258
const filePath = join(assetsDir, relPath);
257-
let svgText = readFileSync(filePath, "utf-8");
258-
if (svgText.length > MAX_SVG_CHARS) {
259-
svgText = svgText.slice(0, MAX_SVG_CHARS) + "\n<!-- truncated -->";
259+
let pngBase64: string;
260+
try {
261+
// Flatten against a contrasting background — white-on-white SVGs render invisible to Vision.
262+
const svgSource = readFileSync(filePath, "utf-8");
263+
const lightFillHits = (
264+
svgSource.match(/fill\s*=\s*["'](#fff(fff)?|white|#[ef][ef][ef]|#[ef]{6})["']/gi) ||
265+
[]
266+
).length;
267+
const darkFillHits = (
268+
svgSource.match(/fill\s*=\s*["'](#000(000)?|black|#[0-3]{6}|#[0-3]{3})["']/gi) || []
269+
).length;
270+
const bg =
271+
lightFillHits > darkFillHits
272+
? { r: 32, g: 32, b: 32 } // dark slate behind light glyphs
273+
: { r: 255, g: 255, b: 255 }; // white behind dark glyphs (default)
274+
const pngBuffer = await sharp(filePath)
275+
.resize({
276+
width: SVG_RENDER_SIZE,
277+
height: SVG_RENDER_SIZE,
278+
fit: "inside",
279+
withoutEnlargement: false,
280+
})
281+
.flatten({ background: bg })
282+
.png()
283+
.toBuffer();
284+
pngBase64 = pngBuffer.toString("base64");
285+
} catch {
286+
// exotic SVG features may break sharp; skip caption rather than block
287+
svgsSkipped++;
288+
return { file: relPath, caption: "" };
260289
}
261290
const response = await ai.models.generateContent({
262291
model,
263292
contents: [
264293
{
265294
role: "user",
266295
parts: [
296+
{ inlineData: { mimeType: "image/png", data: pngBase64 } },
267297
{
268298
text:
269-
"This SVG code is from a website. Describe what it renders in ONE short sentence " +
270-
"for a video storyboard. Focus on: what shape/icon/illustration it is, its colors. " +
271-
"Be factual.\n\n" +
272-
svgText,
299+
"Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " +
300+
"Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " +
301+
"If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " +
302+
"Be factual.",
273303
},
274304
],
275305
},
@@ -293,6 +323,12 @@ export async function captionImagesWithGemini(
293323
);
294324
}
295325
progress("design", `${Object.keys(geminiCaptions).length} total assets captioned`);
326+
if (svgsSkipped > 0) {
327+
progress(
328+
"design",
329+
`skipped rasterizing ${svgsSkipped} SVG(s) — fell back to label-derived`,
330+
);
331+
}
296332
}
297333
} catch (err) {
298334
warnings.push(`Gemini captioning failed: ${err}`);
@@ -358,11 +394,6 @@ export function generateAssetDescriptions(
358394
const svgsPath = join(assetsPath, "svgs");
359395
for (const file of readdirSync(svgsPath)) {
360396
if (!file.endsWith(".svg")) continue;
361-
const geminiCaption = geminiCaptions[`svgs/${file}`];
362-
if (geminiCaption) {
363-
svgLines.push(`svgs/${file}${geminiCaption}`);
364-
continue;
365-
}
366397
const svgMatch = tokens.svgs.find(
367398
(s) =>
368399
s.label &&
@@ -373,9 +404,13 @@ export function generateAssetDescriptions(
373404
.slice(0, 15),
374405
),
375406
);
407+
const geminiCaption = geminiCaptions[`svgs/${file}`];
408+
if (geminiCaption) {
409+
svgLines.push(`svgs/${file}${geminiCaption}`);
410+
continue;
411+
}
376412
const label = svgMatch?.label || file.replace(".svg", "").replace(/-/g, " ");
377-
const isLogo = svgMatch?.isLogo || file.includes("logo");
378-
svgLines.push(`svgs/${file}${isLogo ? "logo: " : "icon: "}${label}`);
413+
svgLines.push(`svgs/${file}${label}`);
379414
}
380415
} catch {
381416
/* no svgs dir */

packages/cli/src/capture/index.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -579,14 +579,19 @@ export async function captureWebsite(
579579
const lines = generateAssetDescriptions(outputDir, tokens, catalogedAssets, geminiCaptions);
580580

581581
if (lines.length > 0) {
582+
const hasGeminiKey = !!(process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY);
583+
const header = hasGeminiKey
584+
? "# Asset Descriptions\n\nOne line per file. Read this instead of opening every image individually.\n\nTo find a specific brand or icon, **grep this file for the brand name in the description text** (e.g. `grep -i 'autodesk' asset-descriptions.md`). The Gemini Vision captions identify what's actually in each file — that's the agent's selector.\n\nThe `logo-<hash>.svg` filename prefix is a cheap structural hint (DOM said this SVG was inside a `<header>`, home-link `<a>`, or had an aria-label matching the page brand). It is NOT a content claim — many `logo-*` files are nav icons or decorative shapes. Trust the captions, not the filename prefix.\n\n"
585+
: "# Asset Descriptions\n\n⚠️ GEMINI_API_KEY not set — descriptions below are catalog-derived (alt text, headings, section context, filename) instead of Vision-generated. To get richer Vision descriptions on the next capture, set GEMINI_API_KEY (or GOOGLE_API_KEY) and re-run.\n\nThe `logo-<hash>.svg` filename prefix is a structural hint (DOM said this SVG was inside a `<header>`, home-link `<a>`, or had an aria-label matching the page brand). To pick the actual brand logo without Vision, open the `logo-*` candidates in a previewer or rasterize them with `sharp` before referencing — composing a fake logo ships off-brand in the final video.\n\n";
582586
writeFileSync(
583587
join(outputDir, "extracted", "asset-descriptions.md"),
584-
"# Asset Descriptions\n\nOne line per file. Read this instead of opening every image individually.\n\n" +
585-
lines.map((l) => "- " + l).join("\n") +
586-
"\n",
588+
header + lines.map((l) => "- " + l).join("\n") + "\n",
587589
"utf-8",
588590
);
589-
progress("design", `${lines.length} asset descriptions written`);
591+
progress(
592+
"design",
593+
`${lines.length} asset descriptions written${hasGeminiKey ? "" : " (no Gemini key — catalog-fallback mode)"}`,
594+
);
590595
}
591596
} catch {
592597
/* non-critical */

packages/cli/src/capture/tokenExtractor.ts

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,14 +353,41 @@ const EXTRACT_SCRIPT = `(() => {
353353
// Keep SVGs that have a label OR are at least 16px wide OR are inside a logo/brand context
354354
var inLogoContext = svg.closest('[class*="logo"], [class*="brand"], [class*="partner"], [class*="customer"], [class*="marquee"]') !== null;
355355
if (!label && !inLogoContext && (!w || parseInt(w) < 16)) return null;
356+
var isLogo = (label && label.toLowerCase().indexOf("logo") !== -1) ||
357+
svg.closest('[class*="logo"], [class*="brand"], [class*="home"], [class*="marquee"], [class*="partner"], [class*="customer"]') !== null;
358+
if (!isLogo) {
359+
var bannerEl = svg.closest('header, nav, [role="banner"]');
360+
if (bannerEl) {
361+
var firstSvg = bannerEl.querySelector('svg');
362+
if (firstSvg === svg) isLogo = true;
363+
}
364+
}
365+
if (!isLogo) {
366+
var anchor = svg.closest('a[href]');
367+
if (anchor) {
368+
var href = anchor.getAttribute('href') || '';
369+
if (href === '/' || href === '#' || href === './' ||
370+
/^https?:\\/\\/[^/]+\\/?$/.test(href)) {
371+
isLogo = true;
372+
}
373+
}
374+
}
375+
if (!isLogo) {
376+
var ariaLabel = svg.getAttribute('aria-label') || svg.getAttribute('title') || '';
377+
var titleBrand = (document.title || '').split(/[-|—]/)[0].trim();
378+
if (titleBrand.length > 1 && titleBrand.length < 30 &&
379+
ariaLabel.toLowerCase().indexOf(titleBrand.toLowerCase()) !== -1) {
380+
isLogo = true;
381+
}
382+
}
356383
var rect = svg.getBoundingClientRect();
357384
return {
358385
label: label || undefined,
359386
viewBox: svg.getAttribute("viewBox") || undefined,
360387
width: Math.round(rect.width),
361388
height: Math.round(rect.height),
362389
outerHTML: svg.outerHTML.slice(0, 10000),
363-
isLogo: (label && label.toLowerCase().indexOf("logo") !== -1) || svg.closest('[class*="logo"], [class*="brand"], [class*="home"], [class*="marquee"], [class*="partner"], [class*="customer"]') !== null
390+
isLogo: isLogo
364391
};
365392
}).filter(Boolean).slice(0, 50);
366393

packages/cli/src/commands/capture.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,20 @@ export const examples: Example[] = [
66
["Capture a website", "hyperframes capture https://stripe.com"],
77
["Capture to a specific directory", "hyperframes capture https://linear.app -o linear-video"],
88
["JSON output for AI agents", "hyperframes capture https://example.com --json"],
9+
[
10+
"Pull a video from the captured manifest by index",
11+
"hyperframes capture video ./linear-video --index 0",
12+
],
913
];
1014

1115
export default defineCommand({
1216
meta: {
1317
name: "capture",
1418
description: "Capture a website as editable HyperFrames components",
1519
},
20+
subCommands: {
21+
video: () => import("./capture/video.js").then((m) => m.default),
22+
},
1623
args: {
1724
url: {
1825
type: "positional",
@@ -46,7 +53,9 @@ export default defineCommand({
4653
async run({ args }) {
4754
const url = args.url as string;
4855

49-
// Validate URL
56+
// citty fires parent's run AFTER routing to a subcommand; skip when args.url is a subcommand name.
57+
if (url === "video") return;
58+
5059
try {
5160
new URL(url);
5261
} catch {

0 commit comments

Comments
 (0)