diff --git a/.github/workflows/deploy-live.yml b/.github/workflows/deploy-live.yml index 911de3281..33fddd09e 100644 --- a/.github/workflows/deploy-live.yml +++ b/.github/workflows/deploy-live.yml @@ -44,6 +44,21 @@ jobs: - name: Install dependencies run: yarn --prefer-offline + # needed for fetching Hashicorp blog feed + - name: Install Playwright + run: | + npx playwright install chromium + npx playwright install-deps chromium + + - name: Cache Playwright + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 + with: + path: | + ~/.cache/ms-playwright + key: ${{ runner.os }}-playwright-${{ hashFiles('package.json') }} + restore-keys: | + ${{ runner.os }}-playwright- + - name: Build site run: REACT_APP_ERROR_REPORTER_APIKEY=${{ secrets.ERROR_REPORTER_APIKEY }} REACT_APP_FIREBASE_APIKEY=${{ secrets.FIREBASE_APIKEY }} REACT_APP_RECAPTCHA_APIKEY=${{ secrets.RECAPTCHA_APIKEY }} yarn build-github && zip -r build.zip build diff --git a/.github/workflows/deploy-preview.yml b/.github/workflows/deploy-preview.yml index 860e567ca..286be0a11 100644 --- a/.github/workflows/deploy-preview.yml +++ b/.github/workflows/deploy-preview.yml @@ -163,6 +163,21 @@ jobs: run: | echo "Building the following products: $PRODUCTS_INCLUDE" + # needed for fetching Hashicorp blog feed + - name: Install Playwright + run: | + npx playwright install chromium + npx playwright install-deps chromium + + - name: Cache Playwright + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 + with: + path: | + ~/.cache/ms-playwright + key: ${{ runner.os }}-playwright-${{ hashFiles('package.json') }} + restore-keys: | + ${{ runner.os }}-playwright- + - name: Build site run: yarn build-github diff --git a/.gitignore b/.gitignore index a69ce09f7..ea3be92ff 100755 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,7 @@ products/**/versions.json # downloaded feeds/blogs src/**/blogs.json src/**/feeds.json + +# playwright +.playwright-storage.json +.pw-user-data/ diff --git a/package.json b/package.json index 368cd13f6..b4cbce01c 100644 --- a/package.json +++ b/package.json @@ -90,6 +90,7 @@ "cross-env": "^7.0.3", "husky": "^8.0.2", "lint-staged": "^13.0.3", + "playwright": "^1.55.0", "postcss": "^8.5.6", "postcss-import": "^16.1.1", "prettier": "2.7.1", diff --git a/scripts/feed2json.js b/scripts/feed2json.js index 3a35535ce..bf89e3043 100644 --- a/scripts/feed2json.js +++ b/scripts/feed2json.js @@ -1,21 +1,35 @@ // rssParser.js +// Plain HTTP(S) fetch -> parse RSS/Atom to JSON Feed. +// Playwright fallback ONLY when Vercel challenge is detected (429 + challenge headers). + const fs = require("fs"); const https = require("https"); const http = require("http"); const { URL } = require("url"); const xml2js = require("xml2js"); -// Helper to fetch XML from URL +const VERBOSE = process.env.FEED_DEBUG === "1"; +const STORAGE_STATE_PATH = "./.playwright-storage.json"; + +// ---------------- Helper: plain fetch (KEEPING ORIGINAL BEHAVIOR) ---------------- function fetchXmlFromUrl(feedUrl) { return new Promise((resolve, reject) => { const url = new URL(feedUrl); const client = url.protocol === "https:" ? https : http; const req = client.get(feedUrl, (res) => { + if (VERBOSE) { + console.error("[plain] status:", res.statusCode); + console.error("[plain] headers:", res.headers); + } + if (res.statusCode !== 200) { - return reject( - new Error(`Request failed with status ${res.statusCode}`) - ); + const err = new Error(`Request failed with status ${res.statusCode}`); + // attach headers so caller can decide if it's a Vercel challenge + err.statusCode = res.statusCode; + err.headers = res.headers; + res.resume(); // drain + return reject(err); } let data = ""; @@ -29,15 +43,178 @@ function fetchXmlFromUrl(feedUrl) { }); } -// Function to convert XML feed (RSS 2.0 or Atom) to JSON Feed format +// ---------------- Vercel challenge detection (strict) ---------------- +function isVercelChallenge(errOrHeaders) { + const headers = errOrHeaders?.headers || errOrHeaders || {}; + const status = errOrHeaders?.statusCode; + const mitigated = (headers["x-vercel-mitigated"] || "") + .toString() + .toLowerCase(); + const challenge = headers["x-vercel-challenge-token"]; + return status === 429 && (!!challenge || mitigated === "challenge"); +} + +// ---------------- Playwright fallback (ephemeral, no Keychain) ---------------- +async function fetchXmlWithPlaywright(feedUrl, opts = {}) { + const { chromium, devices } = require("playwright"); + + const u = new URL(feedUrl); + const isHashi = /(^|\.)hashicorp\.com$/i.test(u.hostname); + const prewarmUrl = + opts.prewarmUrl || (isHashi ? `${u.origin}/en/blog/all` : u.origin); + const storageStatePath = opts.storageStatePath || STORAGE_STATE_PATH; + + const browser = await chromium.launch({ + headless: true, + args: [ + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-gpu", + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-renderer-backgrounding", + "--disable-features=UseKeychain", // macOS hint to avoid Keychain + "--use-mock-keychain", // macOS hint + "--password-store=basic", // Linux: avoid Keyring/KWallet + ], + }); + + const contextOptions = { + userAgent: + devices["Desktop Chrome"]?.userAgent || + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36", + viewport: { width: 900, height: 700 }, + javaScriptEnabled: true, + }; + if (fs.existsSync(storageStatePath)) { + contextOptions.storageState = storageStatePath; // reuse cookies if we have them + } + const context = await browser.newContext(contextOptions); + + // Speed: block heavy/irrelevant resource types + await context.route("**/*", (route) => { + const type = route.request().resourceType(); + const url = route.request().url(); + if ( + type === "image" || + type === "media" || + type === "font" || + type === "stylesheet" + ) { + return route.abort(); + } + if ( + /\b(googletagmanager|google-analytics|doubleclick|segment|mixpanel|hotjar|sentry)\b/i.test( + url + ) + ) { + return route.abort(); + } + return route.continue(); + }); + + try { + const page = await context.newPage(); + + // If no cookies yet, prewarm once so any JS challenge runs + if (!fs.existsSync(storageStatePath)) { + if (VERBOSE) console.error("[pw] prewarming:", prewarmUrl); + await page.goto(prewarmUrl, { + waitUntil: "domcontentloaded", + timeout: 45000, + }); + await page + .waitForLoadState("networkidle", { timeout: 8000 }) + .catch(() => {}); + try { + const state = await context.storageState(); + fs.writeFileSync(storageStatePath, JSON.stringify(state, null, 2)); + if (VERBOSE) console.error("[pw] storageState saved (after prewarm)"); + } catch {} + } + + if (VERBOSE) console.error("[pw] fetching feed:", feedUrl); + const resp = await page.goto(feedUrl, { + waitUntil: "domcontentloaded", + timeout: 45000, + }); + if (!resp) throw new Error("[pw] No response for feed URL"); + + const status = resp.status(); + const headers = resp.headers(); + if (VERBOSE) console.error("[pw] status:", status, "headers:", headers); + + if (status !== 200) { + // Try one refresh cycle if stored cookies are stale + if (fs.existsSync(storageStatePath)) { + if (VERBOSE) + console.error("[pw] stale cookies? retrying with fresh prewarm…"); + await context.clearCookies(); + await context.clearPermissions(); + await page.goto(prewarmUrl, { + waitUntil: "domcontentloaded", + timeout: 45000, + }); + await page + .waitForLoadState("networkidle", { timeout: 8000 }) + .catch(() => {}); + const resp2 = await page.goto(feedUrl, { + waitUntil: "domcontentloaded", + timeout: 45000, + }); + if (resp2 && resp2.status() === 200) { + const xml2 = await resp2.text(); + try { + const state2 = await context.storageState(); + fs.writeFileSync(storageStatePath, JSON.stringify(state2, null, 2)); + if (VERBOSE) console.error("[pw] storageState refreshed"); + } catch {} + await browser.close(); + return xml2; + } + } + const e = new Error(`[pw] Feed request failed: ${status}`); + e.statusCode = status; + e.headers = headers; + throw e; + } + + const xml = await resp.text(); + try { + const state = await context.storageState(); + fs.writeFileSync(storageStatePath, JSON.stringify(state, null, 2)); + if (VERBOSE) console.error("[pw] storageState saved"); + } catch {} + + await browser.close(); + return xml; + } catch (e) { + try { + await browser.close(); + } catch {} + throw e; + } +} + +// ---------------- RSS/Atom → JSON Feed ---------------- async function parseRSS(source) { try { let xml; + if (source.startsWith("http://") || source.startsWith("https://")) { + // ORIGINAL PATH FIRST (unchanged behavior for Medium & others) try { xml = await fetchXmlFromUrl(source); } catch (err) { - if (err.code === "SELF_SIGNED_CERT_IN_CHAIN") { + // If the site is Vercel and challenged us, use the Playwright fallback + if (isVercelChallenge(err)) { + if (VERBOSE) + console.error( + "[main] Vercel 429 challenge → falling back to Playwright" + ); + xml = await fetchXmlWithPlaywright(source); + } else if (err && err.code === "SELF_SIGNED_CERT_IN_CHAIN") { + // original graceful handling return { version: "https://jsonfeed.org/version/1", title: "", @@ -45,10 +222,13 @@ async function parseRSS(source) { description: "", items: [], }; + } else { + // Not a Vercel case → bubble up (preserves original behavior) + throw err; } - throw err; } } else { + // Local file path (original behavior) xml = fs.readFileSync(source, "utf-8"); } @@ -73,7 +253,7 @@ async function parseRSS(source) { title: channel.title, home_page_url: channel.link, description: channel.description, - author: { name: channel.webMaster }, + author: channel.webMaster ? { name: channel.webMaster } : undefined, items: items.filter(Boolean).map((item) => ({ guid: item.guid, url: item.link, @@ -126,12 +306,11 @@ async function parseRSS(source) { } } -// Example usage: +// ---------------- Example usage ---------------- (async () => { const source = process.argv[2] || "example.xml"; // File path or URL passed via CLI const jsonOutput = await parseRSS(source); console.log(JSON.stringify(jsonOutput, null, 2)); })(); -// Export function if needed for external usage module.exports = { parseRSS }; diff --git a/yarn.lock b/yarn.lock index 60edf81da..de08d2910 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6259,6 +6259,11 @@ fs.realpath@^1.0.0: resolved "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f" integrity sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw== +fsevents@2.3.2: + version "2.3.2" + resolved "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz#8a526f78b8fdf4623b709e0b975c52c24c02fd1a" + integrity sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA== + fsevents@~2.3.2: version "2.3.3" resolved "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz#cac6407785d03675a2a5e1a5305c697b347d90d6" @@ -9605,6 +9610,20 @@ pkg-types@^2.0.1: exsolve "^1.0.7" pathe "^2.0.3" +playwright-core@1.55.0: + version "1.55.0" + resolved "https://registry.npmjs.org/playwright-core/-/playwright-core-1.55.0.tgz#ec8a9f8ef118afb3e86e0f46f1393e3bea32adf4" + integrity sha512-GvZs4vU3U5ro2nZpeiwyb0zuFaqb9sUiAJuyrWpcGouD8y9/HLgGbNRjIph7zU9D3hnPaisMl9zG9CgFi/biIg== + +playwright@^1.55.0: + version "1.55.0" + resolved "https://registry.npmjs.org/playwright/-/playwright-1.55.0.tgz#7aca7ac3ffd9e083a8ad8b2514d6f9ba401cc78b" + integrity sha512-sdCWStblvV1YU909Xqx0DhOjPZE4/5lJsIS84IfN9dAZfcl/CIZ5O8l3o0j7hPMjDvqoTF8ZUcc+i/GL5erstA== + dependencies: + playwright-core "1.55.0" + optionalDependencies: + fsevents "2.3.2" + plugin-sitemap-coveo@./plugin-sitemap-coveo: version "1.0.0" dependencies: