Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/deploy-live.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,21 @@ jobs:
- name: Install dependencies
run: yarn --prefer-offline

# needed for fetching Hashicorp blog feed
# The cache step must run BEFORE the install step: actions/cache restores at
# the point where the step executes and saves in the post-job phase, so
# restoring after `playwright install` would never avoid the download.
- name: Cache Playwright
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: |
~/.cache/ms-playwright
key: ${{ runner.os }}-playwright-${{ hashFiles('package.json') }}
restore-keys: |
${{ runner.os }}-playwright-

- name: Install Playwright
run: |
npx playwright install chromium
npx playwright install-deps chromium

- name: Build site
run: REACT_APP_ERROR_REPORTER_APIKEY=${{ secrets.ERROR_REPORTER_APIKEY }} REACT_APP_FIREBASE_APIKEY=${{ secrets.FIREBASE_APIKEY }} REACT_APP_RECAPTCHA_APIKEY=${{ secrets.RECAPTCHA_APIKEY }} yarn build-github && zip -r build.zip build

Expand Down
15 changes: 15 additions & 0 deletions .github/workflows/deploy-preview.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,21 @@ jobs:
run: |
echo "Building the following products: $PRODUCTS_INCLUDE"

# needed for fetching Hashicorp blog feed
# The cache step must run BEFORE the install step: actions/cache restores at
# the point where the step executes and saves in the post-job phase, so
# restoring after `playwright install` would never avoid the download.
- name: Cache Playwright
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: |
~/.cache/ms-playwright
key: ${{ runner.os }}-playwright-${{ hashFiles('package.json') }}
restore-keys: |
${{ runner.os }}-playwright-

- name: Install Playwright
run: |
npx playwright install chromium
npx playwright install-deps chromium

- name: Build site
run: yarn build-github

Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,7 @@ products/**/versions.json
# downloaded feeds/blogs
src/**/blogs.json
src/**/feeds.json

# playwright
.playwright-storage.json
.pw-user-data/
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
"cross-env": "^7.0.3",
"husky": "^8.0.2",
"lint-staged": "^13.0.3",
"playwright": "^1.55.0",
"postcss": "^8.5.6",
"postcss-import": "^16.1.1",
"prettier": "2.7.1",
Expand Down
199 changes: 189 additions & 10 deletions scripts/feed2json.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,35 @@
// feed2json.js
// Plain HTTP(S) fetch -> parse RSS/Atom to JSON Feed.
// Playwright fallback ONLY when Vercel challenge is detected (429 + challenge headers).

const fs = require("fs");
const https = require("https");
const http = require("http");
const { URL } = require("url");
const xml2js = require("xml2js");

// Helper to fetch XML from URL
const VERBOSE = process.env.FEED_DEBUG === "1";
const STORAGE_STATE_PATH = "./.playwright-storage.json";

// ---------------- Helper: plain fetch (KEEPING ORIGINAL BEHAVIOR) ----------------
function fetchXmlFromUrl(feedUrl) {
return new Promise((resolve, reject) => {
const url = new URL(feedUrl);
const client = url.protocol === "https:" ? https : http;

const req = client.get(feedUrl, (res) => {
if (VERBOSE) {
console.error("[plain] status:", res.statusCode);
console.error("[plain] headers:", res.headers);
}

if (res.statusCode !== 200) {
return reject(
new Error(`Request failed with status ${res.statusCode}`)
);
const err = new Error(`Request failed with status ${res.statusCode}`);
// attach headers so caller can decide if it's a Vercel challenge
err.statusCode = res.statusCode;
err.headers = res.headers;
res.resume(); // drain
return reject(err);
}

let data = "";
Expand All @@ -29,26 +43,192 @@ function fetchXmlFromUrl(feedUrl) {
});
}

// Function to convert XML feed (RSS 2.0 or Atom) to JSON Feed format
// ---------------- Vercel challenge detection (strict) ----------------
/**
 * Decide whether a failed response is a Vercel bot-mitigation challenge.
 *
 * The check is strict: the status code must be exactly 429 AND the response
 * must carry either an `x-vercel-challenge-token` header or an
 * `x-vercel-mitigated` header whose value is "challenge" (case-insensitive).
 *
 * @param {object} [errOrHeaders] - An error-like object exposing `statusCode`
 *   and `headers`, or a bare headers object. A bare headers object has no
 *   `statusCode`, so it can never satisfy the 429 requirement.
 * @returns {boolean} `true` only for a 429 response with challenge markers.
 */
function isVercelChallenge(errOrHeaders) {
  const isRateLimited = errOrHeaders?.statusCode === 429;
  const responseHeaders = errOrHeaders?.headers || errOrHeaders || {};

  // Header values may be strings or arrays; normalise before comparing.
  const rawMitigation = responseHeaders["x-vercel-mitigated"] || "";
  const mitigation = rawMitigation.toString().toLowerCase();
  const hasChallengeToken = Boolean(
    responseHeaders["x-vercel-challenge-token"]
  );

  if (!isRateLimited) {
    return false;
  }
  return hasChallengeToken || mitigation === "challenge";
}

// ---------------- Playwright fallback (ephemeral, no Keychain) ----------------
/**
 * Fetch a feed's raw body through headless Chromium (Playwright).
 *
 * Flow:
 *  1. Launch Chromium and create a context, reusing the saved storage state
 *     (cookies) when the storage-state file exists.
 *  2. If no storage state exists yet, visit the prewarm page once so any
 *     JavaScript challenge can run, then save the resulting state.
 *  3. Navigate to the feed URL. On a non-200 status, and if a storage-state
 *     file exists, clear cookies, prewarm again and retry exactly once.
 *  4. Save the storage state after a successful fetch (best effort).
 *
 * The browser is always closed, including when context creation fails.
 *
 * @param {string} feedUrl - Absolute URL of the feed.
 * @param {object} [opts]
 * @param {string} [opts.prewarmUrl] - Page to visit before the feed. Defaults
 *   to `<origin>/en/blog/all` for hashicorp.com hosts, otherwise the origin.
 * @param {string} [opts.storageStatePath] - Where cookies are persisted.
 *   Defaults to STORAGE_STATE_PATH.
 * @returns {Promise<string>} The response body text of the feed URL.
 * @throws {Error} When navigation yields no response, or when the feed still
 *   does not return 200 after the retry (the error carries `statusCode` and
 *   `headers` of the first failed response).
 */
async function fetchXmlWithPlaywright(feedUrl, opts = {}) {
  const { chromium, devices } = require("playwright");

  const u = new URL(feedUrl);
  const isHashi = /(^|\.)hashicorp\.com$/i.test(u.hostname);
  const prewarmUrl =
    opts.prewarmUrl || (isHashi ? `${u.origin}/en/blog/all` : u.origin);
  const storageStatePath = opts.storageStatePath || STORAGE_STATE_PATH;

  const browser = await chromium.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-dev-shm-usage",
      "--disable-gpu",
      "--disable-background-networking",
      "--disable-background-timer-throttling",
      "--disable-renderer-backgrounding",
      "--disable-features=UseKeychain", // macOS hint to avoid Keychain
      "--use-mock-keychain", // macOS hint
      "--password-store=basic", // Linux: avoid Keyring/KWallet
    ],
  });

  // Everything after launch lives inside try/finally so the browser process
  // is released even if context creation or routing setup throws.
  try {
    const hadStoredState = fs.existsSync(storageStatePath);

    const contextOptions = {
      userAgent:
        devices["Desktop Chrome"]?.userAgent ||
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
      viewport: { width: 900, height: 700 },
      javaScriptEnabled: true,
    };
    if (hadStoredState) {
      contextOptions.storageState = storageStatePath; // reuse cookies if we have them
    }
    const context = await browser.newContext(contextOptions);

    // Speed: block heavy/irrelevant resource types and common trackers
    await context.route("**/*", (route) => {
      const type = route.request().resourceType();
      const url = route.request().url();
      if (
        type === "image" ||
        type === "media" ||
        type === "font" ||
        type === "stylesheet"
      ) {
        return route.abort();
      }
      if (
        /\b(googletagmanager|google-analytics|doubleclick|segment|mixpanel|hotjar|sentry)\b/i.test(
          url
        )
      ) {
        return route.abort();
      }
      return route.continue();
    });

    const page = await context.newPage();

    // Visit the prewarm page and give the network a short chance to settle.
    const prewarm = async () => {
      await page.goto(prewarmUrl, {
        waitUntil: "domcontentloaded",
        timeout: 45000,
      });
      // A networkidle timeout is expected on busy pages; it is not an error.
      await page
        .waitForLoadState("networkidle", { timeout: 8000 })
        .catch(() => {});
    };

    // Persist cookies for later runs. Best effort: a failure to write must
    // not fail the fetch, but it is reported when debugging is enabled.
    const saveState = async (successMessage) => {
      try {
        const state = await context.storageState();
        fs.writeFileSync(storageStatePath, JSON.stringify(state, null, 2));
        if (VERBOSE) console.error(successMessage);
      } catch (saveErr) {
        if (VERBOSE) {
          console.error("[pw] could not save storageState:", saveErr.message);
        }
      }
    };

    // If no cookies yet, prewarm once so any JS challenge runs
    if (!hadStoredState) {
      if (VERBOSE) console.error("[pw] prewarming:", prewarmUrl);
      await prewarm();
      await saveState("[pw] storageState saved (after prewarm)");
    }

    if (VERBOSE) console.error("[pw] fetching feed:", feedUrl);
    const resp = await page.goto(feedUrl, {
      waitUntil: "domcontentloaded",
      timeout: 45000,
    });
    if (!resp) throw new Error("[pw] No response for feed URL");

    const status = resp.status();
    const headers = resp.headers();
    if (VERBOSE) console.error("[pw] status:", status, "headers:", headers);

    if (status === 200) {
      const xml = await resp.text();
      await saveState("[pw] storageState saved");
      return xml;
    }

    // Try one refresh cycle if stored cookies are stale
    if (fs.existsSync(storageStatePath)) {
      if (VERBOSE)
        console.error("[pw] stale cookies? retrying with fresh prewarm…");
      await context.clearCookies();
      await context.clearPermissions();
      await prewarm();
      const resp2 = await page.goto(feedUrl, {
        waitUntil: "domcontentloaded",
        timeout: 45000,
      });
      if (resp2 && resp2.status() === 200) {
        const xml2 = await resp2.text();
        await saveState("[pw] storageState refreshed");
        return xml2;
      }
    }

    const e = new Error(`[pw] Feed request failed: ${status}`);
    e.statusCode = status;
    e.headers = headers;
    throw e;
  } finally {
    // Always release the browser; a close failure must not mask the result
    // or the original error.
    try {
      await browser.close();
    } catch (closeErr) {
      if (VERBOSE) {
        console.error("[pw] browser.close failed:", closeErr.message);
      }
    }
  }
}

// ---------------- RSS/Atom → JSON Feed ----------------
async function parseRSS(source) {
try {
let xml;

if (source.startsWith("http://") || source.startsWith("https://")) {
// ORIGINAL PATH FIRST (unchanged behavior for Medium & others)
try {
xml = await fetchXmlFromUrl(source);
} catch (err) {
if (err.code === "SELF_SIGNED_CERT_IN_CHAIN") {
// If the site is Vercel and challenged us, use the Playwright fallback
if (isVercelChallenge(err)) {
if (VERBOSE)
console.error(
"[main] Vercel 429 challenge → falling back to Playwright"
);
xml = await fetchXmlWithPlaywright(source);
} else if (err && err.code === "SELF_SIGNED_CERT_IN_CHAIN") {
// original graceful handling
return {
version: "https://jsonfeed.org/version/1",
title: "",
home_page_url: source,
description: "",
items: [],
};
} else {
// Not a Vercel case → bubble up (preserves original behavior)
throw err;
}
throw err;
}
} else {
// Local file path (original behavior)
xml = fs.readFileSync(source, "utf-8");
}

Expand All @@ -73,7 +253,7 @@ async function parseRSS(source) {
title: channel.title,
home_page_url: channel.link,
description: channel.description,
author: { name: channel.webMaster },
author: channel.webMaster ? { name: channel.webMaster } : undefined,
items: items.filter(Boolean).map((item) => ({
guid: item.guid,
url: item.link,
Expand Down Expand Up @@ -126,12 +306,11 @@ async function parseRSS(source) {
}
}

// Example usage:
// ---------------- Example usage ----------------
// CLI entry point: parse the feed given as the first argument (URL or file
// path, defaulting to "example.xml") and print the JSON Feed to stdout.
(async () => {
  const source = process.argv[2] || "example.xml"; // File path or URL passed via CLI
  const jsonOutput = await parseRSS(source);
  console.log(JSON.stringify(jsonOutput, null, 2));
})().catch((err) => {
  // Handle the rejection explicitly instead of leaving the promise floating:
  // report on stderr and signal failure through a non-zero exit code.
  console.error(err);
  process.exitCode = 1;
});

// Export function if needed for external usage
module.exports = { parseRSS };
19 changes: 19 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6259,6 +6259,11 @@ fs.realpath@^1.0.0:
resolved "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f"
integrity sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==

fsevents@2.3.2:
version "2.3.2"
resolved "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz#8a526f78b8fdf4623b709e0b975c52c24c02fd1a"
integrity sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==

fsevents@~2.3.2:
version "2.3.3"
resolved "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz#cac6407785d03675a2a5e1a5305c697b347d90d6"
Expand Down Expand Up @@ -9605,6 +9610,20 @@ pkg-types@^2.0.1:
exsolve "^1.0.7"
pathe "^2.0.3"

playwright-core@1.55.0:
version "1.55.0"
resolved "https://registry.npmjs.org/playwright-core/-/playwright-core-1.55.0.tgz#ec8a9f8ef118afb3e86e0f46f1393e3bea32adf4"
integrity sha512-GvZs4vU3U5ro2nZpeiwyb0zuFaqb9sUiAJuyrWpcGouD8y9/HLgGbNRjIph7zU9D3hnPaisMl9zG9CgFi/biIg==

playwright@^1.55.0:
version "1.55.0"
resolved "https://registry.npmjs.org/playwright/-/playwright-1.55.0.tgz#7aca7ac3ffd9e083a8ad8b2514d6f9ba401cc78b"
integrity sha512-sdCWStblvV1YU909Xqx0DhOjPZE4/5lJsIS84IfN9dAZfcl/CIZ5O8l3o0j7hPMjDvqoTF8ZUcc+i/GL5erstA==
dependencies:
playwright-core "1.55.0"
optionalDependencies:
fsevents "2.3.2"

plugin-sitemap-coveo@./plugin-sitemap-coveo:
version "1.0.0"
dependencies:
Expand Down