Skip to content

Commit 0dbeca7

Browse files
authored
Add web link preview unfurling (#367)
* feat: add web link preview unfurling
* fix: send web platform for server info
* refactor: use mui dialog for page history
* fix: allow page history preview scrolling
* fix: wait for blocked collab idb deletes
* fix: address link preview branch review
1 parent 8563d1b commit 0dbeca7

21 files changed

Lines changed: 1308 additions & 57 deletions

File tree

api/_lib/__tests__/unfurl.test.ts

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import { unfurl } from '../unfurl';
2+
import { isAllowedHttpUrl } from '../url-safety';
3+
4+
// Handle on the fetch implementation present at module load, so tests that replace
// `global.fetch` can put it back afterwards.
const originalFetch = global.fetch;
5+
6+
/**
 * Builds a minimal stand-in for a fetch `Response`.
 *
 * Only these members are provided: `body` (always `undefined`), a
 * case-insensitive `headers.get`, `ok`, `status` and `text()`.
 *
 * @param options.body    Text resolved by `text()`; defaults to `''`.
 * @param options.headers Header map; names are matched case-insensitively.
 * @param options.ok      Explicit `ok` flag; when omitted it is `status < 400`.
 * @param options.status  HTTP status code; defaults to `200`.
 */
function mockResponse(options: {
  body?: string;
  headers?: Record<string, string>;
  ok?: boolean;
  status?: number;
}): Response {
  const { body = '', headers = {}, ok, status = 200 } = options;

  // Normalise header names once so lookups only need to lowercase the query.
  const headerTable = Object.fromEntries(
    Object.entries(headers).map(([headerName, headerValue]) => [headerName.toLowerCase(), headerValue])
  );
  const lookupHeader = (name: string) => headerTable[name.toLowerCase()] ?? null;
  const readText = async () => body;

  return {
    body: undefined,
    headers: { get: lookupHeader },
    ok: ok ?? status < 400,
    status,
    text: readText,
  } as Response;
}
29+
30+
/**
 * Wraps `html` in a mocked response whose content type is UTF-8 `text/html`.
 *
 * @param html   Markup returned by the response's `text()`.
 * @param status HTTP status code; defaults to `200`.
 */
function htmlResponse(html: string, status = 200): Response {
  const headers = { 'content-type': 'text/html; charset=utf-8' };

  return mockResponse({ body: html, headers, status });
}
37+
38+
describe('unfurl', () => {
  /** Swaps the global fetch for the given jest mock for the duration of one test. */
  const stubFetch = (mock: jest.Mock): void => {
    global.fetch = mock as unknown as typeof fetch;
  };

  /** A 302 response whose Location header points at `location`. */
  const redirectTo = (location: string): Response => mockResponse({ status: 302, headers: { location } });

  afterEach(() => {
    // Undo the per-test fetch replacement and reset mock bookkeeping.
    global.fetch = originalFetch;
    jest.clearAllMocks();
  });

  it('rejects fetch failures so lower-priority providers can run', async () => {
    stubFetch(jest.fn().mockRejectedValue(new Error('network error')));

    await expect(unfurl('https://example.com/path')).rejects.toThrow('network error');
  });

  it('rejects non-OK responses so lower-priority providers can run', async () => {
    stubFetch(jest.fn().mockResolvedValue(htmlResponse('', 403)));

    await expect(unfurl('https://example.com/blocked')).rejects.toThrow('Failed to fetch link preview: 403');
  });

  it('rejects redirects to blocked hosts before fetching the redirect target', async () => {
    stubFetch(jest.fn().mockResolvedValue(redirectTo('http://127.0.0.1/admin')));

    await expect(unfurl('https://example.com/redirect')).rejects.toThrow('Blocked redirect target');
    // Only the initial request may have gone out; the blocked target is never fetched.
    expect(global.fetch).toHaveBeenCalledTimes(1);
  });

  it('rejects redirects to IPv4-mapped IPv6 private hosts before fetching the redirect target', async () => {
    stubFetch(jest.fn().mockResolvedValue(redirectTo('http://[::ffff:127.0.0.1]/admin')));

    await expect(unfurl('https://example.com/redirect')).rejects.toThrow('Blocked redirect target');
    expect(global.fetch).toHaveBeenCalledTimes(1);
  });

  it('follows allowed redirects and extracts metadata from the final response', async () => {
    const finalPage = htmlResponse(
      '<head><meta property="og:title" content="Redirected page"><meta property="og:image" content="/cover.png"></head>'
    );

    stubFetch(
      jest
        .fn()
        .mockResolvedValueOnce(redirectTo('https://redirected.example/final'))
        .mockResolvedValueOnce(finalPage)
    );

    // Relative og:image and the favicon host must both be derived from the redirect target.
    await expect(unfurl('https://example.com/start')).resolves.toMatchObject({
      title: 'Redirected page',
      description: '',
      image: { url: 'https://redirected.example/cover.png' },
      logo: { url: 'https://www.google.com/s2/favicons?domain=redirected.example&sz=128' },
    });

    const manualRedirect = expect.objectContaining({ redirect: 'manual' });

    expect(global.fetch).toHaveBeenNthCalledWith(1, 'https://example.com/start', manualRedirect);
    expect(global.fetch).toHaveBeenNthCalledWith(2, 'https://redirected.example/final', manualRedirect);
  });
});
114+
115+
describe('url safety', () => {
  it('blocks IPv4-mapped IPv6 private address literals', () => {
    // Dotted and hex spellings of loopback, plus a hex-encoded 192.168.x.x address.
    const blockedLiterals = ['http://[::ffff:127.0.0.1]/', 'http://[::ffff:7f00:1]/', 'http://[::ffff:c0a8:101]/'];

    for (const literal of blockedLiterals) {
      expect(isAllowedHttpUrl(new URL(literal))).toBe(false);
    }
  });

  it('allows public IPv4-mapped IPv6 address literals', () => {
    const publicLiteral = new URL('http://[::ffff:0808:0808]/');

    expect(isAllowedHttpUrl(publicLiteral)).toBe(true);
  });
});

api/_lib/unfurl.ts

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
// Server-side link unfurler.
2+
//
3+
// Mirrors the desktop DefaultParser
4+
// (AppFlowy-Premium/frontend/appflowy_flutter/lib/plugins/document/presentation/
5+
// editor_plugins/link_preview/link_parsers/default_parser.dart)
6+
// so web link mentions reach parity with the desktop app: a browser cannot
7+
// scrape cross-origin pages (CORS), so the same fetch + metadata extraction
8+
// runs here instead. Prefer Open Graph, fall back to <title>, then host.
9+
//
10+
// Dependency-free on purpose: only the <head> meta/link tags are needed, so we
11+
// parse them directly rather than pulling an HTML parser into the function.
12+
13+
import { isAllowedHttpUrl } from './url-safety';
14+
15+
/** Byte budget for the HTML that is scanned; the <head> carries all the metadata we read. */
const MAX_HTML_BYTES = 1024 * 50;
/** Milliseconds before the request's abort controller fires. */
const REQUEST_TIMEOUT_MS = 8_000;
/** Longest description returned, counted with the trailing ellipsis. */
const DESCRIPTION_MAX_LENGTH = 240;
/** Value sent as the User-Agent request header. */
const USER_AGENT = 'Mozilla/5.0 (compatible; AppFlowyBot/1.0; +https://appflowy.io)';
/** Number of redirects followed before giving up with "Too many redirects". */
const MAX_REDIRECTS = 5;
20+
21+
/** An image reference attached to a link preview. */
export interface UnfurlImage {
  /** Location of the image. */
  url: string;
}

/** Link-preview metadata returned by `unfurl`. */
export interface UnfurlResult {
  /** Description text; may be the empty string. */
  description: string;
  /** Display title of the page or file. */
  title: string;
  /** Site icon, when one is provided. */
  logo?: UnfurlImage;
  /** Preview image, when one is provided. */
  image?: UnfurlImage;
}
31+
32+
/** Outcome of a fetch: the non-redirect response together with the URL it was requested from. */
interface FetchedHtml {
  /** URL that produced `response` (the last hop when redirects were followed). */
  url: URL;
  /** The response returned for `url`. */
  response: Response;
}
36+
37+
/**
 * Fetches `rawUrl` and builds link-preview metadata from the final response.
 *
 * HTML responses are parsed for their <head> metadata; any other content type
 * yields a filename-based result instead.
 *
 * @param rawUrl Absolute URL to preview.
 * @returns The preview metadata.
 * @throws If `rawUrl` cannot be parsed, if fetching fails or is blocked, or if
 *         the final response is not OK ("Failed to fetch link preview: <status>").
 */
export async function unfurl(rawUrl: string): Promise<UnfurlResult> {
  const fetched = await fetchHtml(new URL(rawUrl));
  const { response, url: finalUrl } = fetched;
  const host = finalUrl.hostname.replace(/^www\./, '');

  // Best-effort release of a body that will not be read; cancel failures are ignored.
  const discardBody = (): void => {
    void response.body?.cancel().catch(() => undefined);
  };

  if (!response.ok) {
    discardBody();
    throw new Error(`Failed to fetch link preview: ${response.status}`);
  }

  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();

  if (isHtml(contentType)) {
    const head = await readHead(response);

    return extractMetadata(head, finalUrl, host);
  }

  discardBody();
  return nonHtmlResult(finalUrl, host, contentType);
}
58+
59+
/**
 * Fetches `url` (following allowed redirects) under a timeout.
 *
 * An abort signal fires after REQUEST_TIMEOUT_MS; the timer is cleared as soon
 * as the redirect-following fetch settles, whether it succeeded or failed.
 */
async function fetchHtml(url: URL): Promise<FetchedHtml> {
  const abortController = new AbortController();
  const timeoutHandle = setTimeout(() => {
    abortController.abort();
  }, REQUEST_TIMEOUT_MS);

  try {
    const fetched = await fetchHtmlFollowingAllowedRedirects(url, abortController.signal);

    return fetched;
  } finally {
    clearTimeout(timeoutHandle);
  }
}
69+
70+
/**
 * Fetches `initialUrl`, following redirects by hand so every hop can be vetted.
 *
 * Each URL is checked with `isAllowedHttpUrl` before it is requested, and each
 * redirect target is checked before it is adopted.
 *
 * @param initialUrl First URL to request.
 * @param signal     Abort signal passed to every request.
 * @returns The first non-redirect response and the URL that produced it.
 * @throws "Blocked redirect target" for a disallowed URL, "Redirect response
 *         missing Location header", or "Too many redirects" once more than
 *         MAX_REDIRECTS redirects have been seen.
 */
async function fetchHtmlFollowingAllowedRedirects(initialUrl: URL, signal: AbortSignal): Promise<FetchedHtml> {
  const requestHeaders = {
    'User-Agent': USER_AGENT,
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };
  const assertAllowed = (candidate: URL): void => {
    if (!isAllowedHttpUrl(candidate)) {
      throw new Error('Blocked redirect target');
    }
  };

  let target = initialUrl;
  let hops = 0;

  while (hops <= MAX_REDIRECTS) {
    assertAllowed(target);

    // `redirect: 'manual'` keeps fetch from following redirects itself.
    const response = await fetch(target.toString(), {
      redirect: 'manual',
      signal,
      headers: requestHeaders,
    });

    if (!isRedirectResponse(response.status)) {
      return { response, url: target };
    }

    const location = response.headers.get('location');

    // The redirect body is never read; release it without waiting.
    void response.body?.cancel().catch(() => undefined);
    if (!location) throw new Error('Redirect response missing Location header');

    // Location may be relative, so resolve it against the URL just requested.
    const next = new URL(location, target);

    assertAllowed(next);
    target = next;
    hops += 1;
  }

  throw new Error('Too many redirects');
}
105+
106+
/** True for the redirect statuses this module follows: 301, 302, 303, 307 and 308. */
function isRedirectResponse(status: number): boolean {
  switch (status) {
    case 301:
    case 302:
    case 303:
    case 307:
    case 308:
      return true;
    default:
      return false;
  }
}
109+
110+
/**
 * Decides whether a content-type value should be parsed as HTML.
 *
 * A missing (empty) content type counts as HTML. Matching is case-sensitive.
 */
function isHtml(contentType: string): boolean {
  if (contentType === '') return true;

  const htmlMarkers = ['text/html', 'application/xhtml'];

  return htmlMarkers.some((marker) => contentType.includes(marker));
}
113+
114+
/**
 * Reads the response body only until `</head>` is seen or the byte budget
 * (MAX_HTML_BYTES) is used up, to keep the function fast and cheap.
 *
 * Chunks are decoded as UTF-8. When the response exposes no body stream the
 * full `response.text()` is returned instead.
 *
 * @returns The decoded text, cut right after `</head>` when that tag was found.
 */
async function readHead(response: Response): Promise<string> {
  const stream = response.body;

  if (!stream) return response.text();

  const HEAD_CLOSE = '</head>';
  const reader = stream.getReader();
  const decoder = new TextDecoder('utf-8');
  let text = '';
  let bytesSeen = 0;

  while (bytesSeen < MAX_HTML_BYTES) {
    const chunk = await reader.read();

    if (chunk.done) break;

    bytesSeen += chunk.value.byteLength;
    // `stream: true` lets a multi-byte character span two chunks.
    text += decoder.decode(chunk.value, { stream: true });

    const closeAt = text.toLowerCase().indexOf(HEAD_CLOSE);

    if (closeAt >= 0) {
      text = text.slice(0, closeAt + HEAD_CLOSE.length);
      break;
    }
  }

  // Stop the download; a failing cancel is not interesting.
  void reader.cancel().catch(() => undefined);
  return text;
}
142+
143+
/**
 * Derives preview metadata from the <meta>/<link>/<title> tags in `head`.
 *
 * Title preference: og:title, then <title>, then meta name="title", then `host`.
 * Description preference: og:description, then meta name="description".
 *
 * @param head Markup to scan.
 * @param url  Base URL used to resolve relative image and icon references.
 * @param host Host used as the last-resort title and for the default favicon.
 */
function extractMetadata(head: string, url: URL, host: string): UnfurlResult {
  const metaTags = matchTags(head, 'meta');
  const linkTags = matchTags(head, 'link');

  // Content of the first <meta> whose `attribute` equals `expected`.
  const contentOf = (attribute: 'property' | 'name', expected: string): string | undefined =>
    metaTags.find((attrs) => attrs[attribute] === expected)?.content;

  const title =
    clean(contentOf('property', 'og:title')) ||
    clean(extractTitleTag(head)) ||
    clean(contentOf('name', 'title')) ||
    host;
  const description = clean(contentOf('property', 'og:description')) || clean(contentOf('name', 'description'));
  const imageUrl = resolveOptional(url, contentOf('property', 'og:image'));
  const faviconUrl = extractFavicon(linkTags, url) ?? defaultFavicon(host);

  const result: UnfurlResult = { title, description: truncate(description) };

  if (imageUrl) {
    result.image = { url: imageUrl };
  }

  result.logo = { url: faviconUrl };
  return result;
}
162+
163+
/**
 * Picks a favicon URL from parsed <link> attribute maps.
 *
 * Exact `rel` values are tried in priority order first; failing that, the
 * first link whose `rel` merely contains "icon" is used.
 *
 * @param links Attribute maps of the <link> tags.
 * @param url   Base URL for resolving the chosen href.
 * @returns The resolved icon URL, or `undefined` when no usable href exists.
 */
function extractFavicon(links: Array<Record<string, string>>, url: URL): string | undefined {
  const relPriority = ['icon', 'shortcut icon', 'apple-touch-icon', 'apple-touch-icon-precomposed'];
  const relOf = (attrs: Record<string, string>): string => (attrs.rel ?? '').toLowerCase();

  // For each preferred rel, take the href of the first link carrying exactly that rel.
  const preferredHref = relPriority
    .map((rel) => links.find((attrs) => relOf(attrs) === rel)?.href)
    .find((href) => Boolean(href));
  const chosenHref = preferredHref || links.find((attrs) => relOf(attrs).includes('icon'))?.href;

  if (!chosenHref) return undefined;

  return resolveOptional(url, chosenHref);
}
176+
177+
/**
 * Finds every opening `<meta …>` or `<link …>` tag in `html` (case-insensitive)
 * and returns the parsed attributes of each, in document order.
 */
function matchTags(html: string, tag: 'meta' | 'link'): Array<Record<string, string>> {
  const openingTag = new RegExp(`<${tag}\\b[^>]*>`, 'gi');
  const found = html.match(openingTag);

  if (found === null) return [];

  return found.map(parseAttributes);
}
182+
183+
// name = "double quoted" | 'single quoted' | bare-value
const ATTR_REGEX = /([a-zA-Z_:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;

/**
 * Extracts `name=value` attributes from a single tag's source text.
 *
 * Names are lowercased; when a name repeats, the last occurrence wins.
 * Attributes written without a value are not reported.
 */
function parseAttributes(tag: string): Record<string, string> {
  const parsed: Record<string, string> = {};

  // The regex is shared and global, so always start scanning from the beginning.
  ATTR_REGEX.lastIndex = 0;

  for (let hit = ATTR_REGEX.exec(tag); hit !== null; hit = ATTR_REGEX.exec(tag)) {
    const [, name, doubleQuoted, singleQuoted, bare] = hit;

    parsed[name.toLowerCase()] = doubleQuoted ?? singleQuoted ?? bare ?? '';
  }

  return parsed;
}
196+
197+
/** Returns the raw inner text of the first `<title>` element, or `undefined` when there is none. */
function extractTitleTag(html: string): string | undefined {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);

  return titleMatch === null ? undefined : titleMatch[1];
}
200+
201+
/**
 * Builds a preview for a response that is not HTML.
 *
 * The title is the percent-decoded last path segment (or `host` when the path
 * has none), the description names the content type when known, and the logo
 * is the default favicon for `host`.
 */
function nonHtmlResult(url: URL, host: string, contentType: string): UnfurlResult {
  const segments = url.pathname.split('/').filter(Boolean);
  const lastSegment = segments[segments.length - 1];
  const filename = lastSegment || host;
  const description = contentType ? `Type: ${contentType}` : '';

  return {
    title: decodeURIComponentSafe(filename),
    description,
    logo: { url: defaultFavicon(host) },
  };
}
210+
211+
/**
 * Builds the fallback favicon URL (Google's s2 favicon service, 128px) for `host`.
 *
 * The host is percent-encoded before it is placed in the query string: URL
 * hostnames may legally contain characters such as `&` or `=`, which would
 * otherwise add or override query parameters of the generated URL.
 *
 * @param host Hostname to look the icon up for.
 * @returns The favicon service URL.
 */
function defaultFavicon(host: string): string {
  return `https://www.google.com/s2/favicons?domain=${encodeURIComponent(host)}&sz=128`;
}
214+
215+
/**
 * Resolves an optional, possibly relative and entity-encoded reference against `base`.
 *
 * @param base URL that relative references are resolved against.
 * @param href Raw attribute value; entities are decoded and whitespace trimmed.
 * @returns The absolute URL, the decoded text itself when it cannot be parsed
 *          as a URL, or `undefined` when `href` is missing or blank.
 */
function resolveOptional(base: URL, href?: string): string | undefined {
  const candidate = href ? decodeEntities(href).trim() : '';

  if (candidate === '') return undefined;

  try {
    const resolved = new URL(candidate, base);

    return resolved.toString();
  } catch {
    // Not parseable as a URL: hand back the decoded text unchanged.
    return candidate;
  }
}
228+
229+
/**
 * Normalises metadata text: decodes entities, collapses each run of
 * whitespace to a single space, and trims. A missing value yields `''`.
 */
function clean(value?: string): string {
  const decoded = decodeEntities(value ?? '');
  const singleSpaced = decoded.replace(/\s+/g, ' ');

  return singleSpaced.trim();
}
234+
235+
/**
 * Shortens `value` to at most DESCRIPTION_MAX_LENGTH UTF-16 code units,
 * ending with an ellipsis when anything was cut.
 *
 * The cut never lands inside a surrogate pair: if the last kept code unit is
 * a high surrogate, it is dropped so the result contains no lone surrogate.
 *
 * @param value Text to shorten.
 * @returns `value` unchanged when short enough, otherwise the shortened text plus `…`.
 */
function truncate(value: string): string {
  if (value.length <= DESCRIPTION_MAX_LENGTH) return value;

  let kept = value.slice(0, DESCRIPTION_MAX_LENGTH - 1);
  const lastUnit = kept.charCodeAt(kept.length - 1);

  // 0xD800–0xDBFF is a high surrogate whose partner was just sliced off.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    kept = kept.slice(0, -1);
  }

  return `${kept.trimEnd()}…`;
}
239+
240+
/** Percent-decodes `value`, returning it untouched when it is not valid percent-encoding. */
function decodeURIComponentSafe(value: string): string {
  let decoded = value;

  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: keep the raw text.
  }

  return decoded;
}
247+
248+
// The named character references this module decodes; anything else is left as written.
const NAMED_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: ' ',
};

/**
 * Decodes HTML character references in `value`.
 *
 * Handles decimal (`&#65;`) and hexadecimal (`&#x41;` / `&#X41;`) numeric
 * references plus the names listed in NAMED_ENTITIES (matched
 * case-insensitively). Anything that cannot be decoded is left exactly as it
 * appeared, so this function never throws on hostile markup:
 *  - numeric references outside 0–0x10FFFF are kept verbatim instead of
 *    letting `String.fromCodePoint` raise a RangeError;
 *  - names are looked up as own properties only, so `&constructor;` and
 *    similar are not replaced by inherited Object.prototype members;
 *  - a decimal reference must consist of decimal digits only.
 *
 * @param value Text that may contain character references.
 * @returns The text with every recognised reference replaced.
 */
function decodeEntities(value: string): string {
  return value.replace(/&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z]+);/g, (match, entity: string) => {
    if (entity.startsWith('#')) {
      const isHex = entity[1] === 'x' || entity[1] === 'X';
      const code = isHex ? parseInt(entity.slice(2), 16) : parseInt(entity.slice(1), 10);
      const isValidCodePoint = Number.isInteger(code) && code >= 0 && code <= 0x10ffff;

      return isValidCodePoint ? String.fromCodePoint(code) : match;
    }

    const name = entity.toLowerCase();

    // Own-property check: a plain lookup would also find inherited members.
    if (!Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, name)) return match;

    return NAMED_ENTITIES[name] ?? match;
  });
}

0 commit comments

Comments
 (0)