codeforboston
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/bill-pdf-text-extraction.md‎
Lines changed: 63 additions & 0 deletions b/‎docs/bill-pdf-text-extraction.md‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎functions/package.json‎
Lines changed: 2 additions & 0 deletions b/‎functions/package.json‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎functions/src/bills/bills.test.ts‎
Lines changed: 82 additions & 0 deletions b/‎functions/src/bills/bills.test.ts‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎functions/src/bills/bills.ts‎
Lines changed: 11 additions & 3 deletions b/‎functions/src/bills/bills.ts‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎functions/src/bills/documentTextFallback.ts‎
Lines changed: 60 additions & 0 deletions b/‎functions/src/bills/documentTextFallback.ts‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎functions/src/bills/pdfText.test.ts‎
Lines changed: 98 additions & 0 deletions b/‎functions/src/bills/pdfText.test.ts‎
Lines changed: 98 additions & 0 deletions
@@ -88,6 +88,7 @@ cert.txt
 # lets each user define their own vscode settings
 .vscode/settings.json
 
+.serena/
 # local MCP server config (contains auth tokens)
 .mcp.json
 mcp-server/create-agent-key.ts
@@ -0,0 +1,63 @@
+# Bill PDF Text Extraction
+
+Some Massachusetts Legislature bill records have `content.DocumentText` set to
+null in the Document API even though the bill PDF contains embedded text. Maple
+now falls back to the official PDF at `/Bills/{court}/{billId}.pdf` when the API
+text is missing.
+
+## Extraction Scope
+
+The current extractor handles PDFs with embedded text. It does not perform OCR,
+so scanned or image-only PDFs are reported but not repaired.
+
+Known 194th General Court examples:
+
+- `H1`: large embedded-text PDF.
+- `H4787`: short embedded-text PDF.
+- `H5008`: ballot initiative embedded-text PDF.
+- `S2539`: regulatory/report-style embedded-text PDF.
+- `H18`: image-only/scanned PDF; no OCR support in this implementation.
+
+## Runtime Scraper Behavior
+
+The bill scraper first calls the MA Legislature Document API. If
+`DocumentText` is present, it stores the API response as before. If
+`DocumentText` is null or absent, the scraper downloads the PDF and tries to
+extract text with `pdf-parse`.
+
+Successful PDF extraction stores the result in the existing
+`content.DocumentText` field. Any other outcome (`fetch-error`, `parse-error`,
+`no-text`, or `too-short`) leaves `DocumentText` absent, and the scraper logs
+the extraction status.
+
+## Backfill Existing Bills
+
+Run the PDF text backfill in dry-run mode first:
+
+```sh
+yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --bills "H1 H18 H4787 H5008 S2539" --output ./bill-pdf-text-dry-run.csv
+```
+
+After reviewing the CSV, rerun with `--commit true` to write the changes:
+
+```sh
+yarn firebase-admin run-script backfillBillPdfText --env dev -- --court 194 --commit true --output ./bill-pdf-text-dev.csv
+```
+
+The script only writes `content.DocumentText` and `fetchedAt` for bills that are
+missing `content.DocumentText`. Bills that already have text are skipped.
+
+## Summary And Topic Backfill
+
+Updating existing bill documents does not trigger the Python LLM function,
+because that function currently runs on document creation only. After committing
+PDF text, run the LLM backfill for the repaired bills:
+
+```sh
+python llm/backfill_summaries_runner.py --court 194 --bill-ids "H1 H4787 H5008 S2539" --output ./summaries-and-topics.csv
+```
+
+Use `--dry-run` to verify which rows would be processed without updating
+Firestore.
+
+`backfill_summaries.py` is the legacy immediate-run wrapper.
+`backfill_summaries_runner.py` is the import-safe CLI and test target.
@@ -30,6 +30,7 @@
     "luxon": "^2.3.1",
     "nanoid": "^3.3.2",
     "object-hash": "^3.0.0",
+    "pdf-parse": "1.1.1",
     "runtypes": "6.6.0",
     "ssl-root-cas": "^1.3.1",
     "typesense": "^1.2.2",
@@ -41,6 +42,7 @@
     "@types/jsdom": "^21.1.7",
     "@types/luxon": "^2.0.9",
     "@types/object-hash": "^2.2.1",
+    "@types/pdf-parse": "1.1.5",
     "copyfiles": "^2.4.1",
     "firebase-functions-test": "^0.3.3",
     "firebase-tools": "^13.18.0",
 
@@ -0,0 +1,82 @@
+jest.mock("../malegislature", () => ({
+  getDocument: jest.fn(),
+  getDocumentPdf: jest.fn()
+}))
+jest.mock("./pdfText", () => ({
+  extractBillTextFromPdf: jest.fn()
+}))
+// Both collaborators of the module under test are replaced with jest.fn stubs.
+import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
+import { extractBillTextFromPdf } from "./pdfText"
+// Typed handles on the mocks so each test can program their return values.
+const mockedApi = jest.requireMock("../malegislature") as {
+  getDocument: jest.Mock
+  getDocumentPdf: jest.Mock
+}
+const mockedExtractBillTextFromPdf =
+  extractBillTextFromPdf as jest.MockedFunction<typeof extractBillTextFromPdf>
+/*
+ * Covers each outcome of getDocumentWithPdfTextFallback: API text present, PDF
+ * text extracted, PDF without text, and PDF download failure.
+ */
+describe("getDocumentWithPdfTextFallback", () => {
+  beforeEach(() => {
+    jest.resetAllMocks()
+  })
+  // API already has text: the PDF endpoint must not be called.
+  it("does not fetch a PDF when API text is present", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: "API text" })
+
+    await expect(
+      getDocumentWithPdfTextFallback(194, "H1")
+    ).resolves.toMatchObject({
+      content: { DocumentText: "API text" },
+      documentTextSource: "api"
+    })
+    expect(mockedApi.getDocumentPdf).not.toHaveBeenCalled()
+  })
+  // API text is null and the extractor reports "extracted": PDF text is stored.
+  it("sets DocumentText when PDF extraction succeeds", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: null })
+    mockedApi.getDocumentPdf.mockResolvedValue(Buffer.from("pdf"))
+    mockedExtractBillTextFromPdf.mockResolvedValue({
+      status: "extracted",
+      text: "PDF text",
+      pageCount: 1,
+      charCount: 7
+    })
+
+    await expect(
+      getDocumentWithPdfTextFallback(194, "H1")
+    ).resolves.toMatchObject({
+      content: { DocumentText: "PDF text" },
+      documentTextSource: "pdf",
+      pdfTextExtraction: { status: "extracted" }
+    })
+  })
+  // Extractor reports "no-text": DocumentText stays off the content object.
+  it("leaves DocumentText absent when PDF has no text", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: null })
+    mockedApi.getDocumentPdf.mockResolvedValue(Buffer.from("pdf"))
+    mockedExtractBillTextFromPdf.mockResolvedValue({
+      status: "no-text",
+      pageCount: 1,
+      charCount: 0
+    })
+
+    const result = await getDocumentWithPdfTextFallback(194, "H18")
+
+    expect(result.content).not.toHaveProperty("DocumentText")
+    expect(result.pdfTextExtraction).toMatchObject({ status: "no-text" })
+  })
+  // getDocumentPdf rejects: the error message surfaces as a "fetch-error" status.
+  it("leaves DocumentText absent when PDF fetch fails", async () => {
+    mockedApi.getDocument.mockResolvedValue({ DocumentText: null })
+    mockedApi.getDocumentPdf.mockRejectedValue(new Error("not found"))
+
+    const result = await getDocumentWithPdfTextFallback(194, "H18")
+
+    expect(result.content).not.toHaveProperty("DocumentText")
+    expect(result.pdfTextExtraction).toMatchObject({
+      status: "fetch-error",
+      error: "not found"
+    })
+  })
+})
@@ -1,9 +1,13 @@
 import { isString } from "lodash"
+import { logger } from "firebase-functions"
 import { logFetchError } from "../common"
 import * as api from "../malegislature"
 import { createScraper } from "../scraper"
+import { getDocumentWithPdfTextFallback } from "./documentTextFallback"
 import { Bill, MISSING_TIMESTAMP } from "./types"
 
+export { getDocumentWithPdfTextFallback } from "./documentTextFallback"
+
 /**
  * There are around 8000 documents. With 8 batches per day, 20 parallel
  * scrapers, and 50 documents per batch, we will process all documents once per
@@ -18,7 +22,8 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } =
     fetchBatchTimeout: 240,
     startBatchTimeout: 240,
     fetchResource: async (court: number, id: string, current) => {
-      const content = await api.getDocument({ id, court })
+      const { content, pdfTextExtraction } =
+        await getDocumentWithPdfTextFallback(court, id)
       const history = await api
         .getBillHistory(court, id)
         .catch(logFetchError("bill history", id))
@@ -28,8 +33,11 @@ export const { fetchBatch: fetchBillBatch, startBatches: startBillBatches } =
         .getSimilarBills(court, id)
         .catch(logFetchError("similar bills", id))
         .then(bills => bills?.map(b => b.BillNumber).filter(isString) ?? [])
-      if (content.DocumentText == null) {
-        delete content.DocumentText
+
+      if (content.DocumentText == null && pdfTextExtraction) {
+        logger.info(
+          `No bill text extracted from PDF for ${court}/${id}: ${pdfTextExtraction.status}`
+        )
       }
 
       const resource: Partial<Bill> = {
 
@@ -0,0 +1,60 @@
+import * as api from "../malegislature"
+import { extractBillTextFromPdf, PdfTextExtractionResult } from "./pdfText"
+/**
+ * Result of `getDocumentWithPdfTextFallback`.
+ *
+ * - `content`: the Document API response, with `DocumentText` either kept from
+ *   the API, filled in from the PDF, or removed when no text was obtained.
+ * - `documentTextSource`: `"api"` or `"pdf"` depending on where the text came
+ *   from; omitted when no text was obtained.
+ * - `pdfTextExtraction`: outcome of the PDF fallback; omitted when the API
+ *   already supplied text.
+ */
+export type DocumentTextFallbackResult = {
+  content: any
+  documentTextSource?: "api" | "pdf"
+  pdfTextExtraction?: PdfTextExtractionResult | PdfFetchFailure
+}
+/**
+ * Extraction outcome used when the PDF download itself fails, so no parsing
+ * was attempted. `error` holds the thrown error's message, or its string form
+ * when the thrown value is not an `Error`.
+ */
+type PdfFetchFailure = {
+  status: "fetch-error"
+  charCount: 0
+  pageCount?: undefined
+  error: string
+}
+/**
+ * Fetches a bill document from the Document API and, when the API returns no
+ * `DocumentText`, tries to recover the text from the bill PDF.
+ *
+ * Mutates the API response in place: a null/undefined `DocumentText` is
+ * deleted, and on successful extraction it is replaced with the PDF text.
+ *
+ * @param court General Court number passed to the API calls.
+ * @param id Bill id passed to the API calls.
+ * @returns The document plus, when the PDF path was taken, the extraction
+ *   outcome. A failed PDF download is reported as a `fetch-error` status
+ *   rather than thrown; errors from `api.getDocument` propagate to the caller.
+ */
+export async function getDocumentWithPdfTextFallback(
+  court: number,
+  id: string
+): Promise<DocumentTextFallbackResult> {
+  const content = await api.getDocument({ id, court })
+  // Prefer the API's own text; no PDF request is made in that case.
+  if (content.DocumentText != null) {
+    return {
+      content,
+      documentTextSource: "api"
+    }
+  }
+  // Remove the null/undefined field so missing text is represented by absence.
+  delete content.DocumentText
+  // A failed download is reported in the result instead of being thrown.
+  let pdf: Buffer
+  try {
+    pdf = await api.getDocumentPdf({ id, court })
+  } catch (e) {
+    return {
+      content,
+      pdfTextExtraction: {
+        status: "fetch-error",
+        charCount: 0,
+        error: e instanceof Error ? e.message : String(e)
+      }
+    }
+  }
+  // Only an "extracted" status writes text back onto the document.
+  const pdfTextExtraction = await extractBillTextFromPdf(pdf)
+  if (pdfTextExtraction.status === "extracted") {
+    content.DocumentText = pdfTextExtraction.text
+    return {
+      content,
+      documentTextSource: "pdf",
+      pdfTextExtraction
+    }
+  }
+  // Every other status is passed through with DocumentText left absent.
+  return {
+    content,
+    pdfTextExtraction
+  }
+}
@@ -0,0 +1,98 @@
+const mockedPdfParse = jest.fn()
+// Replace the parser module with a single jest.fn so each test sets its output.
+jest.mock("pdf-parse/lib/pdf-parse", () => mockedPdfParse)
+
+import { extractBillTextFromPdf, normalizeExtractedBillText } from "./pdfText"
+// normalizeExtractedBillText: whitespace cleanup and page-counter removal.
+describe("normalizeExtractedBillText", () => {
+  it("trims and collapses noisy whitespace", () => {
+    expect(
+      normalizeExtractedBillText(" \r\n  Section   1.   Text\t\t here.  \n\n\n")
+    ).toBe("Section 1. Text here.")
+  })
+  // Lines such as "1 of 3" and "-- 2 of 3 --" are expected to be dropped entirely.
+  it("removes standalone page counters", () => {
+    expect(
+      normalizeExtractedBillText("1 of 3\nHOUSE No. 1\n-- 2 of 3 --\nBill text")
+    ).toBe("HOUSE No. 1\nBill text")
+  })
+  // Already-clean text must come back unchanged.
+  it("preserves substantive bill text", () => {
+    const text =
+      "The General Laws are hereby amended.\nSection 2. This act shall take effect."
+
+    expect(normalizeExtractedBillText(text)).toBe(text)
+  })
+})
+// extractBillTextFromPdf: maps parser output (or a parser failure) to a status.
+describe("extractBillTextFromPdf", () => {
+  beforeEach(() => {
+    mockedPdfParse.mockReset()
+  })
+  // A sufficiently long parser result is reported as "extracted".
+  it("returns extracted when text is long enough", async () => {
+    mockedPdfParse.mockResolvedValue({
+      text: "An Act " + "with enough extracted text. ".repeat(10),
+      numpages: 2,
+      numrender: 2,
+      info: {},
+      metadata: {},
+      version: "default"
+    })
+
+    const result = await extractBillTextFromPdf(Buffer.from("pdf"))
+
+    expect(result.status).toBe("extracted")
+    expect(result.pageCount).toBe(2)
+    expect(result.text).toContain("An Act")
+  })
+  // Whitespace-only parser output is reported as "no-text" with charCount 0.
+  it("returns no-text for empty extraction", async () => {
+    mockedPdfParse.mockResolvedValue({
+      text: " \n\t ",
+      numpages: 1,
+      numrender: 1,
+      info: {},
+      metadata: {},
+      version: "default"
+    })
+
+    await expect(
+      extractBillTextFromPdf(Buffer.from("pdf"))
+    ).resolves.toMatchObject({
+      status: "no-text",
+      charCount: 0,
+      pageCount: 1
+    })
+  })
+  // A 10-character result is reported as "too-short", with the text still returned.
+  it("returns too-short for tiny extraction", async () => {
+    mockedPdfParse.mockResolvedValue({
+      text: "short text",
+      numpages: 1,
+      numrender: 1,
+      info: {},
+      metadata: {},
+      version: "default"
+    })
+
+    await expect(
+      extractBillTextFromPdf(Buffer.from("pdf"))
+    ).resolves.toMatchObject({
+      status: "too-short",
+      text: "short text",
+      pageCount: 1
+    })
+  })
+  // A rejected parse resolves to "parse-error" rather than rejecting.
+  it("returns parse-error when parser throws", async () => {
+    mockedPdfParse.mockRejectedValue(new Error("bad pdf"))
+
+    await expect(
+      extractBillTextFromPdf(Buffer.from("pdf"))
+    ).resolves.toMatchObject({
+      status: "parse-error",
+      error: "bad pdf"
+    })
+  })
+})