From b7b41b9653395649e314fb853a25bb7441819e97 Mon Sep 17 00:00:00 2001
From: xuzhenqi <xuzq09@chinatelecom.cn>
Date: Thu, 28 May 2026 11:50:36 +0800
Subject: [PATCH 1/2] Add Anthropic article download command

---
 cli-manifest.json          |  42 ++++
 clis/anthropic/download.js | 394 +++++++++++++++++++++++++++++++++++++
 2 files changed, 436 insertions(+)
 create mode 100644 clis/anthropic/download.js

diff --git a/cli-manifest.json b/cli-manifest.json
index 944958737..d4d84e444 100644
--- a/cli-manifest.json
+++ b/cli-manifest.json
@@ -1447,6 +1447,48 @@
     "sourceFile": "amazon/search.js",
     "navigateBefore": false
   },
+  {
+    "site": "anthropic",
+    "name": "download",
+    "description": "Download Anthropic articles as local Markdown with YAML frontmatter and images",
+    "access": "read",
+    "domain": "www.anthropic.com",
+    "strategy": "public",
+    "browser": true,
+    "args": [
+      {
+        "name": "url",
+        "type": "str",
+        "required": true,
+        "help": "Anthropic article URL under www.anthropic.com"
+      },
+      {
+        "name": "output",
+        "type": "str",
+        "default": "./anthropic-articles",
+        "required": false,
+        "help": "Output directory"
+      },
+      {
+        "name": "download-images",
+        "type": "boolean",
+        "default": true,
+        "required": false,
+        "help": "Download cover and article images locally"
+      }
+    ],
+    "columns": [
+      "title",
+      "date",
+      "status",
+      "images",
+      "saved",
+      "assets"
+    ],
+    "type": "js",
+    "modulePath": "anthropic/download.js",
+    "sourceFile": "anthropic/download.js"
+  },
   {
     "site": "antigravity",
     "name": "dump",
diff --git a/clis/anthropic/download.js b/clis/anthropic/download.js
new file mode 100644
index 000000000..601995c2b
--- /dev/null
+++ b/clis/anthropic/download.js
@@ -0,0 +1,394 @@
+/**
+ * Anthropic article download - export articles to local Markdown.
+ *
+ * Usage:
+ *   opencli anthropic download --url "https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback" --output ./anthropic
+ */
+import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
+import TurndownService from 'turndown';
+import { gfm } from 'turndown-plugin-gfm';
+import { cli, Strategy } from '@jackwener/opencli/registry';
+import { ArgumentError, CommandExecutionError, getErrorMessage } from '@jackwener/opencli/errors';
+
+const DEFAULT_OUTPUT = './anthropic-articles';
+const IMAGE_CONCURRENCY = 5;
+const USER_AGENT = 'Mozilla/5.0 (compatible; OpenCLI Anthropic article archiver)';
+
+const MONTHS = new Map([
+    ['jan', '01'], ['january', '01'],
+    ['feb', '02'], ['february', '02'],
+    ['mar', '03'], ['march', '03'],
+    ['apr', '04'], ['april', '04'],
+    ['may', '05'],
+    ['jun', '06'], ['june', '06'],
+    ['jul', '07'], ['july', '07'],
+    ['aug', '08'], ['august', '08'],
+    ['sep', '09'], ['sept', '09'], ['september', '09'],
+    ['oct', '10'], ['october', '10'],
+    ['nov', '11'], ['november', '11'],
+    ['dec', '12'], ['december', '12'],
+]);
+
+function boolish(value) {
+    if (value === true) return true;
+    if (value === false) return false;
+    if (typeof value === 'string') return ['1', 'true', 'yes', 'on'].includes(value.toLowerCase());
+    return !!value;
+}
+
+function normalizeAnthropicUrl(value) {
+    const raw = String(value || '').trim();
+    if (!raw) {
+        throw new ArgumentError('Missing --url', 'Example: opencli anthropic download --url "https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback"');
+    }
+    let parsed;
+    try {
+        parsed = new URL(raw);
+    } catch {
+        throw new ArgumentError(`Invalid Anthropic article URL: ${raw}`);
+    }
+    if (!['http:', 'https:'].includes(parsed.protocol)) {
+        throw new ArgumentError('Anthropic article URL must use http or https');
+    }
+    if (parsed.hostname !== 'www.anthropic.com') {
+        throw new ArgumentError(
+            `Unsupported Anthropic URL: ${raw}`,
+            'Use an article URL under https://www.anthropic.com/',
+        );
+    }
+    parsed.protocol = 'https:';
+    parsed.hash = '';
+    return parsed.toString();
+}
+
+function sanitizeFilename(name, maxLength = 120) {
+    return String(name)
+        .replace(/[<>:"/\\|?*\x00-\x1f]/g, '_')
+        .replace(/\s+/g, '_')
+        .replace(/_+/g, '_')
+        .replace(/^_|_$/g, '')
+        .slice(0, maxLength);
+}
+
+function parseAnthropicDate(value) {
+    const raw = String(value || '').replace(/^Published\s+/i, '').trim();
+    const iso = raw.match(/\b(\d{4})-(\d{2})-(\d{2})\b/);
+    if (iso) return `${iso[1]}-${iso[2]}-${iso[3]}`;
+
+    const match = raw.match(/\b([A-Za-z]+)\s+(\d{1,2}),\s*(\d{4})\b/);
+    if (!match) return '';
+    const month = MONTHS.get(match[1].toLowerCase());
+    if (!month) return '';
+    return `${match[3]}-${month}-${String(match[2]).padStart(2, '0')}`;
+}
+
+function extractDateFromMarkdown(markdown) {
+    const match = String(markdown || '').match(/(?:^|\n)(?:Published\s+)?([A-Za-z]+\s+\d{1,2},\s*\d{4})(?:\n|$)/);
+    return parseAnthropicDate(match?.[1] || '');
+}
+
+function createTurndown() {
+    const td = new TurndownService({
+        headingStyle: 'atx',
+        codeBlockStyle: 'fenced',
+        bulletListMarker: '-',
+    });
+    td.use(gfm);
+    td.remove(['script', 'style', 'noscript']);
+    td.addRule('lineBreak', {
+        filter: 'br',
+        replacement: () => '\n',
+    });
+    td.addRule('ignoreBase64Images', {
+        filter: (node) => {
+            if (node.nodeName !== 'IMG') return false;
+            const src = node.getAttribute?.('src') || '';
+            return src.startsWith('data:');
+        },
+        replacement: () => '',
+    });
+    return td;
+}
+
+function cleanMarkdown(markdown, title) {
+    let output = String(markdown || '')
+        .replace(/\u00a0/g, ' ')
+        .replace(/[ \t]+$/gm, '')
+        .replace(/\n{3,}/g, '\n\n')
+        .trim();
+
+    const escapedTitle = title.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    output = output.replace(new RegExp(`^#\\s+${escapedTitle}\\s*\\n+`, 'i'), '');
+    return output.trim();
+}
+
+function yamlScalar(value) {
+    return JSON.stringify(String(value ?? ''));
+}
+
+function buildFrontmatter(metadata) {
+    const lines = ['---'];
+    for (const [key, value] of Object.entries(metadata)) {
+        if (value === '' || value === undefined || value === null) continue;
+        if (typeof value === 'number' || typeof value === 'boolean') {
+            lines.push(`${key}: ${value}`);
+        } else {
+            lines.push(`${key}: ${yamlScalar(value)}`);
+        }
+    }
+    lines.push('---');
+    return `${lines.join('\n')}\n\n`;
+}
+
+function detectExtension(url, contentType = '') {
+    const content = contentType.split(';')[0].trim().toLowerCase();
+    const byType = {
+        'image/jpeg': 'jpg',
+        'image/png': 'png',
+        'image/gif': 'gif',
+        'image/webp': 'webp',
+        'image/svg+xml': 'svg',
+        'image/avif': 'avif',
+    };
+    if (byType[content]) return byType[content];
+
+    try {
+        const ext = path.extname(new URL(url).pathname).replace('.', '').toLowerCase();
+        if (/^[a-z0-9]{2,5}$/.test(ext)) return ext;
+    } catch {
+        // Fall through.
+    }
+    return 'jpg';
+}
+
+async function downloadImage(url, destinationBase, sourceUrl) {
+    const response = await fetch(url, {
+        headers: {
+            'User-Agent': USER_AGENT,
+            'Referer': sourceUrl,
+        },
+    });
+    if (!response.ok) {
+        throw new Error(`HTTP ${response.status}`);
+    }
+    const ext = detectExtension(url, response.headers.get('content-type') || '');
+    const destination = `${destinationBase}.${ext}`;
+    const bytes = Buffer.from(await response.arrayBuffer());
+    await fs.writeFile(destination, bytes);
+    return destination;
+}
+
+async function downloadImages(urls, assetsDir, assetsDirName, sourceUrl) {
+    const urlMap = new Map();
+    const uniqueUrls = [...new Set(urls.filter(Boolean))];
+    await fs.mkdir(assetsDir, { recursive: true });
+
+    for (let i = 0; i < uniqueUrls.length; i += IMAGE_CONCURRENCY) {
+        const batch = uniqueUrls.slice(i, i + IMAGE_CONCURRENCY);
+        await Promise.all(batch.map(async (url, indexInBatch) => {
+            const imageIndex = i + indexInBatch;
+            const stem = imageIndex === 0 ? 'cover' : `img_${String(imageIndex).padStart(3, '0')}`;
+            try {
+                const destination = await downloadImage(url, path.join(assetsDir, stem), sourceUrl);
+                urlMap.set(url, `${assetsDirName}/${path.basename(destination)}`);
+            } catch {
+                // Keep the remote URL in Markdown when a single image download fails.
+            }
+        }));
+    }
+
+    return urlMap;
+}
+
+function replaceImageLinks(markdown, urlMap) {
+    return markdown.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match, alt, url) => {
+        return urlMap.has(url) ? `![${alt}](${urlMap.get(url)})` : match;
+    });
+}
+
+function buildExtractAnthropicArticleJs() {
+    return `
+      (() => {
+        const text = (el) => (el?.textContent || '').replace(/\\s+/g, ' ').trim();
+        const meta = (selector) => document.querySelector(selector)?.getAttribute('content')?.trim() || '';
+        const absolutize = (value) => {
+          if (!value || value.startsWith('data:') || value.startsWith('javascript:') || value.startsWith('#')) return '';
+          try { return new URL(value, location.href).href; } catch { return ''; }
+        };
+        const pickSrcset = (srcset) => {
+          const candidates = String(srcset || '').split(',').map((part) => {
+            const [url, width] = part.trim().split(/\\s+/);
+            return { url, score: Number.parseInt(width, 10) || 0 };
+          }).filter((item) => item.url);
+          candidates.sort((a, b) => b.score - a.score);
+          return candidates[0]?.url || '';
+        };
+        const chooseRoot = () => {
+          const direct = Array.from(document.querySelectorAll('article, main'))
+            .sort((a, b) => text(b).length - text(a).length)[0];
+          if (direct && text(direct).length > 500) return direct;
+          const candidates = Array.from(document.querySelectorAll('section, div'))
+            .filter((el) => el.querySelector('h1'))
+            .sort((a, b) => text(b).length - text(a).length);
+          return candidates[0] || document.body;
+        };
+        const title = text(document.querySelector('h1'))
+          || meta('meta[property="og:title"]')
+          || (document.title || '').replace(/\\s*[|/].*$/, '').trim()
+          || 'untitled';
+        const dateText = meta('meta[property="article:published_time"], meta[name="date"], meta[name="publishdate"]')
+          || ((document.body?.innerText || '').match(/Published\\s+[A-Za-z]+\\s+\\d{1,2},\\s*\\d{4}/)?.[0] || '');
+        const root = chooseRoot().cloneNode(true);
+
+        root.querySelectorAll([
+          'script',
+          'style',
+          'noscript',
+          'nav',
+          'header',
+          'footer',
+          'aside',
+          'form',
+          'button',
+          '[aria-hidden="true"]',
+          '[class*="newsletter"]',
+          '[class*="cookie"]',
+          '[class*="share"]'
+        ].join(',')).forEach((el) => el.remove());
+
+        let removedTitle = false;
+        const titleText = title.toLowerCase();
+        Array.from(root.querySelectorAll('a, p, div, span, time, h1')).forEach((el) => {
+          const value = text(el);
+          const lower = value.toLowerCase();
+          const hasMedia = !!el.querySelector('img, picture, video, audio, iframe');
+          if (el.tagName === 'H1' && lower === titleText && !removedTitle) {
+            el.remove();
+            removedTitle = true;
+            return;
+          }
+          if (hasMedia) return;
+          if (value === 'Engineering at Anthropic' || /^Published\\s+[A-Za-z]+\\s+\\d{1,2},\\s*\\d{4}$/.test(value)) {
+            el.remove();
+          }
+        });
+
+        const imageUrls = [];
+        const seen = new Set();
+        root.querySelectorAll('img').forEach((img) => {
+          const raw = img.getAttribute('src')
+            || img.getAttribute('data-src')
+            || img.getAttribute('data-original')
+            || pickSrcset(img.getAttribute('srcset') || img.getAttribute('data-srcset'));
+          const absolute = absolutize(raw);
+          if (!absolute) {
+            img.remove();
+            return;
+          }
+          img.setAttribute('src', absolute);
+          if (!seen.has(absolute)) {
+            seen.add(absolute);
+            imageUrls.push(absolute);
+          }
+        });
+
+        const coverUrl = absolutize(meta('meta[property="og:image"], meta[name="twitter:image"]')) || imageUrls[0] || '';
+        if (coverUrl && !seen.has(coverUrl)) imageUrls.unshift(coverUrl);
+
+        return {
+          title,
+          dateText,
+          author: meta('meta[name="author"], meta[property="article:author"]') || 'Anthropic',
+          description: meta('meta[name="description"], meta[property="og:description"]'),
+          coverUrl,
+          imageUrls,
+          contentHtml: root.innerHTML
+        };
+      })()
+    `;
+}
+
+async function downloadAnthropicArticle(page, args) {
+    const url = normalizeAnthropicUrl(args.url);
+    const output = String(args.output || DEFAULT_OUTPUT);
+    const shouldDownloadImages = boolish(args['download-images']);
+
+    await page.goto(url, { waitUntil: 'load', settleMs: 3000 });
+    const data = await page.evaluate(buildExtractAnthropicArticleJs()).catch((error) => {
+        throw new CommandExecutionError(`Failed to extract Anthropic article: ${getErrorMessage(error)}`);
+    });
+
+    if (!data?.title || !data?.contentHtml) {
+        throw new CommandExecutionError('Anthropic article content was not found', 'The page loaded, but no article root could be extracted.');
+    }
+
+    let markdown = cleanMarkdown(createTurndown().turndown(data.contentHtml), data.title);
+    const date = parseAnthropicDate(data.dateText) || extractDateFromMarkdown(markdown);
+    const datePrefix = date || new Date().toISOString().slice(0, 10);
+    const basename = `${datePrefix}-${sanitizeFilename(data.title)}`;
+    const outputDir = path.resolve(output);
+    const assetsDirName = `${basename}_assets`;
+    const assetsDir = path.join(outputDir, assetsDirName);
+    const filePath = path.join(outputDir, `${basename}.md`);
+    let cover = '';
+    let imageCount = 0;
+
+    await fs.mkdir(outputDir, { recursive: true });
+
+    if (shouldDownloadImages && Array.isArray(data.imageUrls) && data.imageUrls.length > 0) {
+        const urlMap = await downloadImages(data.imageUrls, assetsDir, assetsDirName, url);
+        markdown = replaceImageLinks(markdown, urlMap);
+        cover = data.coverUrl ? urlMap.get(data.coverUrl) || '' : '';
+        imageCount = urlMap.size;
+    }
+
+    const frontmatter = buildFrontmatter({
+        title: data.title,
+        date,
+        author: data.author || 'Anthropic',
+        site: 'Anthropic',
+        source_url: url,
+        description: data.description,
+        cover,
+        downloaded_at: new Date().toISOString(),
+        image_count: imageCount,
+    });
+
+    await fs.writeFile(filePath, `${frontmatter}# ${data.title}\n\n${markdown}\n`, 'utf8');
+
+    return [{
+        title: data.title,
+        date: date || '-',
+        status: 'success',
+        images: imageCount,
+        saved: filePath,
+        assets: shouldDownloadImages ? assetsDir : '',
+    }];
+}
+
+export const anthropicDownloadCommand = cli({
+    site: 'anthropic',
+    name: 'download',
+    access: 'read',
+    description: 'Download Anthropic articles as local Markdown with YAML frontmatter and images',
+    domain: 'www.anthropic.com',
+    strategy: Strategy.PUBLIC,
+    browser: true,
+    args: [
+        { name: 'url', required: true, help: 'Anthropic article URL under www.anthropic.com' },
+        { name: 'output', default: DEFAULT_OUTPUT, help: 'Output directory' },
+        { name: 'download-images', type: 'boolean', default: true, help: 'Download cover and article images locally' },
+    ],
+    columns: ['title', 'date', 'status', 'images', 'saved', 'assets'],
+    func: downloadAnthropicArticle,
+});
+
+export const __test__ = {
+    buildExtractAnthropicArticleJs,
+    cleanMarkdown,
+    extractDateFromMarkdown,
+    normalizeAnthropicUrl,
+    parseAnthropicDate,
+    sanitizeFilename,
+};

From 2b8ca2ea3386241a0fa45cf43006646c6f1418ca Mon Sep 17 00:00:00 2001
From: xuzhenqi <xuzq09@chinatelecom.cn>
Date: Thu, 28 May 2026 12:12:26 +0800
Subject: [PATCH 2/2] Document Anthropic adapter

---
 README.md                          |  1 +
 README.zh-CN.md                    |  1 +
 docs/.vitepress/config.mts         |  1 +
 docs/adapters/browser/anthropic.md | 54 ++++++++++++++++++++++++++++++
 docs/adapters/index.md             |  1 +
 5 files changed, 58 insertions(+)
 create mode 100644 docs/adapters/browser/anthropic.md

diff --git a/README.md b/README.md
index 00659994b..751211168 100644
--- a/README.md
+++ b/README.md
@@ -211,6 +211,7 @@ OpenCLI supports downloading images, videos, and articles from supported platfor
 | **xiaoyuzhou** | Audio, Transcript | Downloads episode audio and transcript JSON/text with local credentials |
 | **zhihu** | Articles (Markdown) | Exports with optional image download |
 | **weixin** | Articles (Markdown) | WeChat Official Account articles |
+| **anthropic** | Articles (Markdown) | Anthropic articles with frontmatter and local image assets |
 
 For video downloads, install `yt-dlp` first: `brew install yt-dlp`
 
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 891c9e28a..cedbfe6e0 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -199,6 +199,7 @@ OpenCLI 支持从各平台下载图片、视频和文章。
 | **小宇宙** | 音频、转录 | 使用本地凭证下载单集音频和转录 JSON / 文本 |
 | **知乎** | 文章（Markdown） | 导出文章，可选下载图片到本地 |
 | **微信公众号** | 文章（Markdown） | 导出微信公众号文章为 Markdown |
+| **Anthropic** | 文章（Markdown） | 导出 Anthropic 文章，包含 frontmatter 和本地图片资源 |
 | **豆瓣** | 图片 | 下载电影条目的海报 / 剧照图片 |
 
 ### 前置依赖
diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts
index c8d553b21..383fc10d5 100644
--- a/docs/.vitepress/config.mts
+++ b/docs/.vitepress/config.mts
@@ -50,6 +50,7 @@ export default defineConfig({
               text: 'Browser Adapters',
               collapsed: false,
               items: [
+                { text: 'Anthropic', link: '/adapters/browser/anthropic' },
                 { text: 'Twitter / X', link: '/adapters/browser/twitter' },
                 { text: 'Reddit', link: '/adapters/browser/reddit' },
                 { text: 'Tieba', link: '/adapters/browser/tieba' },
diff --git a/docs/adapters/browser/anthropic.md b/docs/adapters/browser/anthropic.md
new file mode 100644
index 000000000..c2da2ea43
--- /dev/null
+++ b/docs/adapters/browser/anthropic.md
@@ -0,0 +1,54 @@
+# Anthropic
+
+**Mode**: Browser / **Domain**: `www.anthropic.com`
+
+## Commands
+
+| Command | Description |
+|---------|-------------|
+| `opencli anthropic download --url <url>` | Download an Anthropic article as Markdown with YAML frontmatter and local images |
+
+## Usage Examples
+
+```bash
+# Export an Anthropic news article to Markdown
+opencli anthropic download \
+  --url "https://www.anthropic.com/news/claude-haiku-4-5" \
+  --output ./anthropic-articles
+
+# Export an engineering article
+opencli anthropic download \
+  --url "https://www.anthropic.com/engineering/building-effective-agents" \
+  --output ./anthropic-articles
+
+# Export without downloading images
+opencli anthropic download \
+  --url "https://www.anthropic.com/news/claude-haiku-4-5" \
+  --download-images false
+```
+
+## Output
+
+`download` writes one Markdown file and, when image download is enabled, a sibling assets directory:
+
+- `YYYY-MM-DD-title.md` - Markdown with YAML frontmatter and article body
+- `YYYY-MM-DD-title_assets/` - cover and body images referenced by relative paths
+
+The frontmatter includes:
+
+- `title`
+- `date`
+- `author`
+- `site`
+- `source_url`
+- `description`
+- `cover`
+- `downloaded_at`
+- `image_count`
+
+The command supports article URLs under `https://www.anthropic.com/`, including engineering, news, and research/index-style pages.
+
+## Prerequisites
+
+- Chrome running
+- [Browser Bridge extension](/guide/browser-bridge) installed
diff --git a/docs/adapters/index.md b/docs/adapters/index.md
index a0057329e..377503617 100644
--- a/docs/adapters/index.md
+++ b/docs/adapters/index.md
@@ -6,6 +6,7 @@ Run `opencli list` for the live registry.
 
 | Site                                     | Commands                                                                                                                                                                                                                                                         | Mode                                |
 | ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
+| **[anthropic](./browser/anthropic.md)**     | `download`                                                                                                                                                                                                                                                       | Browser                             |
 | **[twitter](./browser/twitter.md)**         | `trending` `bookmarks` `profile` `search` `timeline` `thread` `following` `followers` `notifications` `post` `reply` `delete` `like` `likes` `lists` `article` `follow` `unfollow` `bookmark` `unbookmark` `download` `accept` `reply-dm` `block` `unblock` `hide-reply` | 🔐 Browser                          |
 | **[reddit](./browser/reddit.md)**           | `hot` `frontpage` `popular` `search` `subreddit` `read` `user` `user-posts` `user-comments` `upvote` `save` `comment` `reply` `subscribe` `saved` `upvoted`                                                                                                      | 🔐 Browser                          |
 | **[tieba](./browser/tieba.md)**             | `hot` `posts` `search` `read`                                                                                                                                                                                                                                    | 🔐 Browser                          |