Merge pull request #103 from firecrawl/mog/parse-api

mogery · web-flow · commit 3c6ac28a0c7b · 2026-04-27T17:24:38.000+02:00
feat: parse command (ENG-4830)
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "firecrawl-cli",
-  "version": "1.15.2",
+  "version": "1.16.0",
   "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.",
   "main": "dist/index.js",
   "bin": {
diff --git a/skills/firecrawl-cli/SKILL.md b/skills/firecrawl-cli/SKILL.md
@@ -62,6 +62,7 @@ Follow this escalation pattern:
 | AI-powered data extraction  | `agent`               | Need structured data from complex sites                   |
 | Interact with a page        | `scrape` + `interact` | Content requires clicks, form fills, pagination, or login |
 | Download a site to files    | `download`            | Save an entire site as local files                        |
+| Parse a local file          | `parse`               | File on disk (PDF, DOCX, XLSX, etc.) — not a URL          |
 
 For detailed command reference, run `firecrawl <command> --help`.
 
@@ -85,6 +86,7 @@ For detailed command reference, run `firecrawl <command> --help`.
 - **AI-powered structured extraction from complex sites** -> [firecrawl-agent](../firecrawl-agent/SKILL.md)
 - **Clicks, forms, login, pagination, or post-scrape browser actions** -> [firecrawl-interact](../firecrawl-interact/SKILL.md)
 - **Downloading a site to local files** -> [firecrawl-download](../firecrawl-download/SKILL.md)
+- **Parsing a local file (PDF, DOCX, XLSX, HTML, etc.)** -> [firecrawl-parse](../firecrawl-parse/SKILL.md)
 - **Install, auth, or setup problems** -> [rules/install.md](rules/install.md)
 - **Output handling and safe file-reading patterns** -> [rules/security.md](rules/security.md)
 - **Integrating Firecrawl into an app, adding `FIRECRAWL_API_KEY` to `.env`, or choosing endpoint usage in product code** -> use the `firecrawl-build` skills (already installed alongside this CLI skill)
diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md
@@ -0,0 +1,61 @@
+---
+name: firecrawl-parse
+description: |
+  Efficiently extract and convert the contents of any local file—such as PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, or HTML—into clean, well-formatted markdown saved to disk. Use this skill whenever the user requests to parse, read, or extract information from a file on their computer, including phrases like “parse this PDF”, “convert this document”, “read this file”, “extract text from”, or when a local file path (not a URL) is provided. This skill offers advanced options like generating AI-powered summaries and answering questions based on the file's content. Prefer this tool over `scrape` when handling local files to deliver precise, structured outputs for downstream tasks.
+allowed-tools:
+  - Bash(firecrawl *)
+  - Bash(npx firecrawl *)
+---
+
+# firecrawl parse
+
+Turn a local document into clean markdown on disk. Supports **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML/HTM**.
+
+## When to use
+
+- You have a file on disk (not a URL) and want its text as markdown
+- User drops a PDF/DOCX and asks what it says, or to summarize it
+- Use `scrape` instead when the source is a URL
+
+## Quick start
+
+Always save to `.firecrawl/` with `-o` — parsed docs can be hundreds of KB and blow up context if streamed to stdout. Add `.firecrawl/` to `.gitignore`.
+
+```bash
+mkdir -p .firecrawl
+
+# File → markdown
+firecrawl parse ./paper.pdf -o .firecrawl/paper.md
+
+# AI summary
+firecrawl parse ./paper.pdf -S -o .firecrawl/paper-summary.md
+
+# Ask a question about the doc
+firecrawl parse ./paper.pdf -Q "What are the main conclusions?" \
+  -o .firecrawl/paper-qa.md
+```
+
+Then use `head`, `grep`, `rg`, etc., or read the file incrementally — don't load the whole thing at once.
+
+## Options
+
+| Option                 | Description                             |
+| ---------------------- | --------------------------------------- |
+| `-S, --summary`        | AI-generated summary                    |
+| `-Q, --query <prompt>` | Ask a question about the parsed content |
+| `-o, --output <path>`  | Output file path — **always use this**  |
+| `-f, --format <fmt>`   | `markdown` (default), `html`, `summary` |
+| `--timeout <ms>`       | Timeout for the parse job               |
+| `--timing`             | Show request duration                   |
+
+## Tips
+
+- Quote paths with spaces: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`.
+- Max upload size: **50 MB** per file.
+- Credits: roughly 1 per PDF page; an HTML file costs a flat 1 credit.
+- Check `.firecrawl/` before re-parsing the same file.
+- To check your credit balance (recommended for batch processing and similar workflows), use the `firecrawl credit-usage` command.
+
+## See also
+
+- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea for URLs
diff --git a/src/commands/parse.ts b/src/commands/parse.ts
@@ -0,0 +1,239 @@
+/**
+ * Parse command implementation
+ *
+ * Uploads a local file to the Firecrawl /v2/parse endpoint and returns the
+ * parsed document in the requested format(s). Supported file types:
+ *   .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import type { FormatOption } from '@mendable/firecrawl-js';
+import type { ParseOptions, ParseResult } from '../types/parse';
+import type { ScrapeFormat } from '../types/scrape';
+import { getClient } from '../utils/client';
+import { getConfig, validateConfig } from '../utils/config';
+import { handleScrapeOutput } from '../utils/output';
+
+const DEFAULT_API_URL = 'https://api.firecrawl.dev';
+
+/**
+ * File extensions accepted by /v2/parse (mirrors the API controller).
+ * `executeParse` lowercases the file's extension and rejects anything not in
+ * this set before the file is read or uploaded.
+ */
+const SUPPORTED_EXTENSIONS = new Set([
+  '.html',
+  '.htm',
+  '.pdf',
+  '.docx',
+  '.doc',
+  '.odt',
+  '.rtf',
+  '.xlsx',
+  '.xls',
+]);
+
+/**
+ * Best-effort content-type lookup so the API's kind detector has a hint
+ * even if the extension is ambiguous.
+ *
+ * Keys are lowercased extensions including the leading dot. `executeParse`
+ * falls back to `application/octet-stream` when an extension has no entry
+ * in this table.
+ */
+const CONTENT_TYPE_BY_EXT: Record<string, string> = {
+  '.html': 'text/html',
+  '.htm': 'text/html',
+  '.pdf': 'application/pdf',
+  '.docx':
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  '.doc': 'application/msword',
+  '.odt': 'application/vnd.oasis.opendocument.text',
+  '.rtf': 'application/rtf',
+  '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+  '.xls': 'application/vnd.ms-excel',
+};
+
+/**
+ * Print a JSON timing report to stderr when `options.timing` is set.
+ *
+ * The report contains the file argument, the request start time as an ISO
+ * string, the elapsed milliseconds, and a success/error status. When an error
+ * is supplied, its message (or 'Unknown error' for non-Error values) is
+ * appended as the last field.
+ *
+ * @param options - Parse options; only `timing` and `file` are read.
+ * @param requestStartTime - Epoch milliseconds at which the request started.
+ * @param requestEndTime - Epoch milliseconds at which the request finished.
+ * @param error - Optional failure that ended the request.
+ */
+function outputTiming(
+  options: ParseOptions,
+  requestStartTime: number,
+  requestEndTime: number,
+  error?: Error | unknown
+): void {
+  if (options.timing) {
+    const failed = Boolean(error);
+    const elapsedMs = requestEndTime - requestStartTime;
+
+    const report: Record<string, string> = {
+      file: options.file,
+      requestTime: new Date(requestStartTime).toISOString(),
+      duration: `${elapsedMs}ms`,
+      status: failed ? 'error' : 'success',
+      // Only failed requests carry an `error` field, and it comes last.
+      ...(failed
+        ? { error: error instanceof Error ? error.message : 'Unknown error' }
+        : {}),
+    };
+
+    console.error('Timing:', JSON.stringify(report, null, 2));
+  }
+}
+
+/**
+ * Assemble the `formats` array sent to the API (mirrors scrape's behavior).
+ *
+ * Explicitly requested formats come first, followed by a `query` format when
+ * `options.query` is set. If neither produces an entry, the result is
+ * `['markdown']`.
+ *
+ * @param options - Parse options; `formats` and `query` are read.
+ * @returns A non-empty list of formats for the request payload.
+ */
+function buildFormats(options: ParseOptions): FormatOption[] {
+  const requested: FormatOption[] = [...(options.formats ?? [])];
+
+  if (options.query) {
+    requested.push({ type: 'query', prompt: options.query } as any);
+  }
+
+  return requested.length > 0 ? requested : ['markdown'];
+}
+
+/**
+ * Build the JSON `options` payload uploaded alongside the file.
+ *
+ * `formats` and `integration: 'cli'` are always present. The remaining keys
+ * are copied from `options` only when set: `onlyMainContent` and `timeout`
+ * when not `undefined`, `includeTags`/`excludeTags` when non-empty, and
+ * `location` when truthy.
+ *
+ * @param options - Parse options supplied by the CLI.
+ * @returns A plain object ready to be JSON-serialized into the form field.
+ */
+function buildOptionsPayload(options: ParseOptions): Record<string, unknown> {
+  const payload: Record<string, unknown> = {
+    formats: buildFormats(options),
+    integration: 'cli',
+  };
+
+  const { onlyMainContent, includeTags, excludeTags, timeout, location } =
+    options;
+
+  if (onlyMainContent !== undefined) payload.onlyMainContent = onlyMainContent;
+  if (includeTags?.length) payload.includeTags = includeTags;
+  if (excludeTags?.length) payload.excludeTags = excludeTags;
+  if (timeout !== undefined) payload.timeout = timeout;
+  if (location) payload.location = location;
+
+  return payload;
+}
+
+/**
+ * Execute the parse command by POSTing a multipart upload to /v2/parse.
+ *
+ * Local problems (missing path, path that is not a regular file, unsupported
+ * extension, unreadable file) as well as HTTP and network failures are
+ * reported through the returned result as `success: false` with an `error`
+ * message instead of being thrown.
+ *
+ * NOTE(review): `getClient` and `validateConfig` run outside any try block;
+ * if either throws (e.g. on a missing API key) the error propagates to the
+ * caller — confirm that is the intended behavior.
+ *
+ * @param options - Parse options; `file` is resolved against the current
+ *   working directory.
+ * @returns The parsed document on success (`payload.data` when present,
+ *   otherwise the whole response body), or a failure result.
+ */
+export async function executeParse(
+  options: ParseOptions
+): Promise<ParseResult> {
+  const filePath = path.resolve(options.file);
+
+  // One guarded stat call answers both "does it exist?" and "is it a regular
+  // file?", so the file vanishing between two separate checks cannot throw.
+  let stat: fs.Stats;
+  try {
+    stat = fs.statSync(filePath);
+  } catch {
+    return {
+      success: false,
+      error: `File not found: ${options.file}`,
+    };
+  }
+
+  if (!stat.isFile()) {
+    return {
+      success: false,
+      error: `Not a file: ${options.file}`,
+    };
+  }
+
+  const ext = path.extname(filePath).toLowerCase();
+  if (!SUPPORTED_EXTENSIONS.has(ext)) {
+    return {
+      success: false,
+      error:
+        `Unsupported file type "${ext || '(none)'}". ` +
+        `Supported extensions: ${[...SUPPORTED_EXTENSIONS].join(', ')}`,
+    };
+  }
+
+  // Ensure auth/url is resolved through the same config pipeline scrape uses.
+  if (options.apiKey || options.apiUrl) {
+    getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl });
+  }
+
+  const config = getConfig();
+  const apiKey = options.apiKey || config.apiKey;
+  validateConfig(apiKey);
+
+  // Drop a single trailing slash so the endpoint path can be appended as-is.
+  const apiUrl = (options.apiUrl || config.apiUrl || DEFAULT_API_URL).replace(
+    /\/$/,
+    ''
+  );
+
+  // Reading can still fail after the stat check (permissions, file removed
+  // in the meantime); report it like every other local failure.
+  let buffer: Buffer;
+  try {
+    buffer = fs.readFileSync(filePath);
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : 'Unknown error';
+    return {
+      success: false,
+      error: `Failed to read file: ${options.file} (${reason})`,
+    };
+  }
+
+  const filename = path.basename(filePath);
+  const contentType = CONTENT_TYPE_BY_EXT[ext] ?? 'application/octet-stream';
+
+  const form = new FormData();
+  form.append(
+    'file',
+    new Blob([new Uint8Array(buffer)], { type: contentType }),
+    filename
+  );
+  form.append('options', JSON.stringify(buildOptionsPayload(options)));
+
+  const requestStartTime = Date.now();
+
+  try {
+    const response = await fetch(`${apiUrl}/v2/parse`, {
+      method: 'POST',
+      headers: apiKey ? { Authorization: `Bearer ${apiKey}` } : undefined,
+      body: form,
+    });
+
+    // Timing is reported once response headers arrive, before the body is
+    // read and decoded.
+    const requestEndTime = Date.now();
+    outputTiming(options, requestStartTime, requestEndTime);
+
+    // A body that is not valid JSON is treated as an empty object so the
+    // HTTP status can still drive the error message below.
+    const payload = (await response.json().catch(() => ({}))) as any;
+
+    if (!response.ok || payload?.success === false) {
+      const message =
+        payload?.error ||
+        `HTTP ${response.status}: ${response.statusText || 'Request failed'}`;
+      return { success: false, error: message };
+    }
+
+    return {
+      success: true,
+      data: payload?.data ?? payload,
+    };
+  } catch (error) {
+    const requestEndTime = Date.now();
+    outputTiming(options, requestStartTime, requestEndTime, error);
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : 'Unknown error occurred',
+    };
+  }
+}
+
+/**
+ * Run the parse command and emit its result.
+ *
+ * When a query was asked and the successful response carries an `answer`,
+ * only that answer is written (to `options.output` when given). Otherwise the
+ * result is handed to the scrape output formatter, since the /v2/parse
+ * response shape matches /v2/scrape; the formats passed along default to
+ * `['markdown']` when none were requested.
+ *
+ * @param options - Parse options supplied by the CLI.
+ */
+export async function handleParseCommand(options: ParseOptions): Promise<void> {
+  const result = await executeParse(options);
+
+  const answer =
+    options.query && result.success ? result.data?.answer : undefined;
+
+  if (answer) {
+    const output = await import('../utils/output');
+    output.writeOutput(answer, options.output, Boolean(options.output));
+    return;
+  }
+
+  const requested = options.formats ?? [];
+  const effectiveFormats: ScrapeFormat[] =
+    requested.length > 0 ? [...requested] : ['markdown'];
+
+  handleScrapeOutput(
+    result,
+    effectiveFormats,
+    options.output,
+    options.pretty,
+    options.json
+  );
+}
diff --git a/src/index.ts b/src/index.ts
diff --git a/src/types/parse.ts b/src/types/parse.ts

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "firecrawl-cli",`
`3`		`- "version": "1.15.2",`
	`3`	`+ "version": "1.16.0",`
`4`	`4`	`"description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.",`
`5`	`5`	`"main": "dist/index.js",`
`6`	`6`	`"bin": {`