Skip to content

Commit 9e9af86

Browse files
NiallJoeMaher and claude committed
fix(feed): repair broken article thumbnails from doubled image URLs
HackerNoon's RSS media:thumbnail/media:content URLs are malformed at the source — an already-absolute CDN URL prefixed with their own origin (https://hackernoon.com/https://cdn.hackernoon.com/…), which 404s. Our ingestion stored them verbatim, and the redesigned cards SSR the <img>, so the broken-image icon stuck: the error event fires before React hydrates and attaches onError, so the fallback never runs.

- add unwrapDoubledUrl() (utils/url.ts) + unit tests
- card: unwrap at render and detect pre-hydration failures via a ref callback, so a dead image collapses to no thumbnail (not a broken icon)
- sanitise URLs at ingestion (fetch-rss, admin/sync-feeds — media + OG)
- one-off scrub script for already-stored rows

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 5f723be commit 9e9af86

6 files changed

Lines changed: 180 additions & 11 deletions

File tree

app/api/admin/sync-feeds/route.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import { eq } from "drizzle-orm";
1111
import Parser from "rss-parser";
1212
import { customAlphabet } from "nanoid";
1313
import { fetchOgImage } from "@/lib/og-image";
14+
import { ensureHttps, unwrapDoubledUrl } from "@/utils/url";
1415

1516
// Generate Reddit-style short IDs: lowercase + numbers, 7 characters
1617
const generateShortId = customAlphabet(
@@ -128,10 +129,14 @@ function extractImageUrl(item: Parser.Item): string | null {
128129
| { url?: string; type?: string }
129130
| undefined;
130131

131-
if (mediaContent?.$?.url) return mediaContent.$.url;
132-
if (mediaThumbnail?.$?.url) return mediaThumbnail.$.url;
132+
// Some feeds (e.g. HackerNoon's media:thumbnail) prefix an already-absolute
133+
// CDN URL with their own origin, producing a 404ing doubled URL — unwrap it.
134+
if (mediaContent?.$?.url)
135+
return ensureHttps(unwrapDoubledUrl(mediaContent.$.url));
136+
if (mediaThumbnail?.$?.url)
137+
return ensureHttps(unwrapDoubledUrl(mediaThumbnail.$.url));
133138
if (enclosure?.url && enclosure.type?.startsWith("image/"))
134-
return enclosure.url;
139+
return ensureHttps(unwrapDoubledUrl(enclosure.url));
135140

136141
return null;
137142
}
@@ -246,7 +251,9 @@ export async function POST(request: Request) {
246251

247252
// Fetch OG image from the article URL
248253
try {
249-
const ogImageUrl = await fetchOgImage(item.link);
254+
const ogImageUrl = ensureHttps(
255+
unwrapDoubledUrl(await fetchOgImage(item.link)),
256+
);
250257
if (ogImageUrl) {
251258
await db
252259
.update(aggregated_article)

components/UnifiedContentCard/UnifiedContentCard.tsx

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
"use client";
22

3-
import { useState } from "react";
3+
import { useCallback, useState } from "react";
44
import Link from "next/link";
55
import * as Sentry from "@sentry/nextjs";
66
import { api } from "@/server/trpc/react";
77
import { signIn, useSession } from "next-auth/react";
88
import { toast } from "sonner";
99
import VoteControl from "@/components/Vote/VoteControl";
1010
import { ReportButton } from "@/components/ReportModal/ReportModal";
11-
import { ensureHttps } from "@/utils/url";
11+
import { ensureHttps, unwrapDoubledUrl } from "@/utils/url";
1212
import { getRelativeTime } from "@/utils/relativeTime";
1313

1414
export type ContentType = "POST" | "LINK";
@@ -94,10 +94,18 @@ const UnifiedContentCard = ({
9494
const [isBookmarked, setIsBookmarked] = useState(initialBookmarked);
9595
const [shared, setShared] = useState(false);
9696

97+
// SSR race: a broken <img> begins loading during HTML parse and its `error`
98+
// event can fire before React hydrates and attaches `onError`, so the handler
99+
// never runs and the broken image sticks. This ref callback runs on mount — a
100+
// loaded-but-zero-size image already failed — and hides it like onError would.
101+
const checkBrokenImage = useCallback((node: HTMLImageElement | null) => {
102+
if (node?.complete && node.naturalWidth === 0) setImageError(true);
103+
}, []);
104+
97105
const { data: session } = useSession();
98106
const utils = api.useUtils();
99107

100-
const imageUrl = ensureHttps(rawImageUrl);
108+
const imageUrl = ensureHttps(unwrapDoubledUrl(rawImageUrl));
101109

102110
// Card URL priority (slug ends with urlId, so it stays canonical; urlId is the
103111
// fallback when slug is missing): discussion /d/ > member /{username}/ >
@@ -200,7 +208,7 @@ const UnifiedContentCard = ({
200208

201209
return (
202210
<article
203-
className="group rounded-lg border border-hairline bg-surface p-5 transition-colors duration-base ease-out hover:border-strong"
211+
className="group rounded-lg border border-hairline bg-surface p-4 transition-colors duration-base ease-out hover:border-strong sm:p-5"
204212
data-testid="content-card"
205213
>
206214
<div className="flex gap-4">
@@ -226,17 +234,17 @@ const UnifiedContentCard = ({
226234
(handleHref ? (
227235
<Link
228236
href={handleHref}
229-
className="whitespace-nowrap text-sm font-semibold text-fg hover:underline"
237+
className="min-w-0 max-w-full truncate text-sm font-semibold text-fg hover:underline"
230238
>
231239
{authorName}
232240
</Link>
233241
) : (
234-
<span className="whitespace-nowrap text-sm font-semibold text-fg">
242+
<span className="min-w-0 max-w-full truncate text-sm font-semibold text-fg">
235243
{authorName}
236244
</span>
237245
))}
238246
<span
239-
className="whitespace-nowrap font-mono text-xs text-faint"
247+
className="min-w-0 basis-full truncate font-mono text-xs text-faint"
240248
// Cards now SSR (feed initialData) and relative times derive
241249
// from Date.now() — a minute boundary between server render and
242250
// hydration would otherwise log a text mismatch.
@@ -293,6 +301,7 @@ const UnifiedContentCard = ({
293301
className="relative h-[68px] w-[104px] flex-shrink-0 self-start overflow-hidden rounded-sm border border-hairline"
294302
>
295303
<img
304+
ref={checkBrokenImage}
296305
src={imageUrl}
297306
alt=""
298307
className="h-full w-full object-cover"

scripts/fetch-rss.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import { eq, and, isNotNull } from "drizzle-orm";
1313
import { nanoid } from "nanoid";
1414
import Parser from "rss-parser";
1515
import crypto from "crypto";
16+
import { ensureHttps, unwrapDoubledUrl } from "../utils/url";
1617

1718
const parser = new Parser({
1819
timeout: 10000,
@@ -231,6 +232,10 @@ async function fetchAndProcessFeed(source: FeedSource) {
231232
console.log(` ✓ ${readTimeMins} min read`);
232233
}
233234

235+
// Some feeds (e.g. HackerNoon's media:thumbnail) prefix an already-
236+
// absolute CDN URL with their own origin, producing a 404ing doubled URL.
237+
imageUrl = ensureHttps(unwrapDoubledUrl(imageUrl));
238+
234239
// Rate limit: small delay between fetches
235240
await delay(200);
236241

scripts/fix-doubled-image-urls.ts

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/**
2+
* One-off scrub for image URLs that an upstream feed prefixed with its own
3+
* origin, leaving a doubled-up, 404ing URL — e.g. HackerNoon's media:thumbnail:
4+
* https://hackernoon.com/https://cdn.hackernoon.com/images/x.png
5+
*
6+
* The card now unwraps these at render, so this only cleans the stored data
7+
* (used by server-side OG/SEO tags). Safe to re-run — it's idempotent.
8+
*
9+
* Usage: npx tsx scripts/fix-doubled-image-urls.ts
10+
*/
11+
12+
import { db } from "../server/db";
13+
import { posts, aggregated_article } from "../server/db/schema";
14+
import { eq, like, or } from "drizzle-orm";
15+
import { ensureHttps, unwrapDoubledUrl } from "../utils/url";
16+
17+
// Cheap SQL LIKE pre-filter for rows whose URL contains a second, embedded
// "http(s)://" somewhere after the leading scheme. Shape:
//   <scheme>://<anything>http<s?>://<rest>
// The "%" between "://" and "http" is essential: without it the pattern demands
// that "http" follow the FIRST "://" immediately, which never happens in
// "https://hackernoon.com/https://cdn.hackernoon.com/…", so no row is selected.
// Over-matching (e.g. a `?url=https://…` proxy param) is fine here; the real
// decision is made by unwrapDoubledUrl per row.
const DOUBLED = "%://%http%://%";
21+
22+
/**
 * Rewrites `posts.coverImage` for every row picked up by the DOUBLED LIKE
 * pre-filter. Each candidate is passed through unwrapDoubledUrl + ensureHttps
 * and only written back when the cleaned value differs from what is stored,
 * then a "fixed/selected" tally is logged.
 */
async function fixPosts() {
  const candidates = await db
    .select({ id: posts.id, coverImage: posts.coverImage })
    .from(posts)
    .where(like(posts.coverImage, DOUBLED));

  let repaired = 0;
  for (const { id, coverImage } of candidates) {
    const sanitised = ensureHttps(unwrapDoubledUrl(coverImage));
    // Nothing to do when the helpers left the stored value untouched.
    if (sanitised === coverImage) continue;

    await db
      .update(posts)
      .set({ coverImage: sanitised })
      .where(eq(posts.id, id));
    repaired += 1;
  }

  console.log(`posts.coverImage: ${repaired}/${candidates.length} fixed`);
}
41+
42+
/**
 * Rewrites `imageUrl` / `ogImageUrl` on aggregated_article rows where either
 * column is picked up by the DOUBLED LIKE pre-filter. Both columns are passed
 * through unwrapDoubledUrl + ensureHttps; a row is updated (both columns in one
 * statement) when at least one cleaned value differs from the stored one.
 */
async function fixAggregatedArticles() {
  const eitherColumnDoubled = or(
    like(aggregated_article.imageUrl, DOUBLED),
    like(aggregated_article.ogImageUrl, DOUBLED),
  );

  const candidates = await db
    .select({
      id: aggregated_article.id,
      imageUrl: aggregated_article.imageUrl,
      ogImageUrl: aggregated_article.ogImageUrl,
    })
    .from(aggregated_article)
    .where(eitherColumnDoubled);

  let repaired = 0;
  for (const article of candidates) {
    const imageUrl = ensureHttps(unwrapDoubledUrl(article.imageUrl));
    const ogImageUrl = ensureHttps(unwrapDoubledUrl(article.ogImageUrl));

    const unchanged =
      imageUrl === article.imageUrl && ogImageUrl === article.ogImageUrl;
    if (unchanged) continue;

    await db
      .update(aggregated_article)
      .set({ imageUrl, ogImageUrl })
      .where(eq(aggregated_article.id, article.id));
    repaired += 1;
  }

  console.log(`aggregated_article: ${repaired}/${candidates.length} fixed`);
}
71+
72+
/**
 * Script entry point: scrubs posts first, then aggregated articles, and exits
 * the process with status 0 once both passes have completed.
 */
async function main() {
  console.log("=== Scrubbing doubled image URLs ===");

  await fixPosts();
  await fixAggregatedArticles();

  console.log("Done.");
  process.exit(0);
}

// Any rejection from main() is logged and turned into a non-zero exit code.
main().catch((error) => {
  console.error("Fatal error:", error);
  process.exit(1);
});

utils/url.test.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { describe, it, expect } from "vitest";
2+
import { unwrapDoubledUrl, ensureHttps } from "./url";
3+
4+
describe("unwrapDoubledUrl", () => {
5+
it("unwraps a URL prefixed with another origin (HackerNoon media:thumbnail bug)", () => {
6+
expect(
7+
unwrapDoubledUrl(
8+
"https://hackernoon.com/https://cdn.hackernoon.com/images/A7coZ0.png",
9+
),
10+
).toBe("https://cdn.hackernoon.com/images/A7coZ0.png");
11+
});
12+
13+
it("unwraps an http-wrapped https URL, keeping the inner scheme", () => {
14+
expect(
15+
unwrapDoubledUrl("http://example.com/https://cdn.example.com/x.png"),
16+
).toBe("https://cdn.example.com/x.png");
17+
});
18+
19+
it("leaves a normal absolute URL untouched", () => {
20+
expect(unwrapDoubledUrl("https://cdn.thenewstack.io/media/x.png")).toBe(
21+
"https://cdn.thenewstack.io/media/x.png",
22+
);
23+
});
24+
25+
it("does not treat a scheme in the query string as a wrapper", () => {
26+
// Only one real scheme at index 0 — a `?url=https://...` proxy param is left alone.
27+
expect(
28+
unwrapDoubledUrl("https://img.proxy/optimize?url=https://cdn.site/x.png"),
29+
).toBe("https://img.proxy/optimize?url=https://cdn.site/x.png");
30+
});
31+
32+
it("passes through null/empty", () => {
33+
expect(unwrapDoubledUrl(null)).toBeNull();
34+
expect(unwrapDoubledUrl(undefined)).toBeNull();
35+
expect(unwrapDoubledUrl("")).toBeNull();
36+
});
37+
38+
it("composes with ensureHttps to clean a wrapped http CDN url", () => {
39+
expect(
40+
ensureHttps(
41+
unwrapDoubledUrl(
42+
"https://hackernoon.com/http://cdn.hackernoon.com/x.png",
43+
),
44+
),
45+
).toBe("https://cdn.hackernoon.com/x.png");
46+
});
47+
});

utils/url.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,24 @@ export function safeExternalHref(
2929
return parsed.protocol === "https:" ? url.trim() : undefined;
3030
}
3131

32+
/**
 * Unwraps a URL that an upstream feed prefixed with its own origin, leaving an
 * already-absolute URL doubled up — e.g. HackerNoon's `media:thumbnail` serves
 * `https://hackernoon.com/https://cdn.hackernoon.com/x.png`, which 404s.
 *
 * Only an origin prefix (`scheme://host/`) that is IMMEDIATELY followed by
 * another `http(s)://` is stripped; this repeats, so a URL wrapped several
 * times collapses to the innermost one. Everything else is returned unchanged:
 * - a normal URL (only scheme at index 0);
 * - a scheme inside the query string or fragment (`?url=https://…`, `#https://…`);
 * - a scheme deeper in the path (e.g. `/image/fetch/https://…` proxy or
 *   archive URLs), where the outer URL is the one that is meant to be loaded.
 *
 * @param url Raw URL as received from a feed or read from storage.
 * @returns The unwrapped URL, the input unchanged when it is not wrapped, or
 *   `null` for null/undefined/empty input.
 */
export function unwrapDoubledUrl(
  url: string | null | undefined,
): string | null {
  if (!url) return null;

  // `scheme://host/` at the very start, with another scheme right behind it.
  // The host class excludes `/`, `?` and `#`, so the match can never reach into
  // a deeper path segment, the query string, or the fragment.
  const wrappingOrigin = /^https?:\/\/[^/?#]+\/(?=https?:\/\/)/;

  let unwrapped = url;
  // Each pass removes one wrapping origin, so the string strictly shrinks and
  // the loop terminates.
  while (wrappingOrigin.test(unwrapped)) {
    unwrapped = unwrapped.replace(wrappingOrigin, "");
  }
  return unwrapped;
}
49+
3250
/** Upgrades an `http://` URL to `https://`. Returns null for empty input. */
3351
export function ensureHttps(url: string | null | undefined): string | null {
3452
if (!url) return null;

0 commit comments

Comments
 (0)