|
| 1 | +/* eslint-disable @typescript-eslint/no-unused-expressions */ |
| 2 | +// Copyright (c) Microsoft Corporation. |
| 3 | +// Licensed under the MIT License. |
| 4 | + |
| 5 | +/** |
| 6 | + * GenAIScript supporting runtime |
| 7 | + * This module provides core functionality for text classification, data transformation, |
| 8 | + * PDF processing, and file system operations in the GenAIScript environment. |
| 9 | + */ |
| 10 | +import type { |
| 11 | + ChatGenerationContext, |
| 12 | + ParsePDFOptions, |
| 13 | + PromptGenerator, |
| 14 | + PromptGeneratorOptions, |
| 15 | + WorkspaceFile, |
| 16 | +} from "@genaiscript/core"; |
| 17 | + |
/**
 * Converts a PDF file to markdown, one page at a time, by combining the text
 * extracted by the PDF parser with a rendered image of each page.
 *
 * Pages are processed sequentially: the markdown produced for the two most
 * recent pages is passed to the next prompt as `PREVIOUS_PAGES` so that the
 * model can keep formatting consistent across page boundaries.
 *
 * @param file - PDF file to convert.
 * @param options - Prompt and PDF parsing options. `renderAsImage` cannot be
 * set by the caller; pages are always rendered as images. Recognized fields:
 * - `ctx`: generation context used to run the per-page prompts; defaults to
 *   `globalPromptContext.env.generator`.
 * - `label`: prefix of the per-page prompt label; defaults to `markdownify PDF`.
 * - `model`: model alias; defaults to `"ocr"`.
 * - `responseType`: defaults to `"markdown"`.
 * - `instructions`: extra instructions appended to each page prompt, either as
 *   a string or as a prompt generator invoked with the inline prompt context.
 * - every remaining field is forwarded to both the PDF parser and `runPrompt`.
 * @returns The extracted page texts, the rendered page images, and the
 * generated markdown for each page (same order as `pages`).
 * @throws Error when a per-page prompt reports an error.
 */
export async function markdownifyPdf(
  file: WorkspaceFile,
  options?: PromptGeneratorOptions &
    Omit<ParsePDFOptions, "renderAsImage"> & {
      instructions?: string | PromptGenerator;
      ctx?: ChatGenerationContext;
    },
) {
  const {
    ctx = globalPromptContext.env.generator,
    label = `markdownify PDF`,
    model = "ocr",
    responseType = "markdown",
    instructions,
    ...rest
  } = options ?? {};

  // extract text and render pages as images
  const { pages, images = [] } = await globalPromptContext.parsers.PDF(file, {
    ...rest,
    renderAsImage: true,
  });
  const markdowns: string[] = [];
  // Sequential on purpose: each page prompt depends on the markdown generated
  // for the pages before it.
  for (let i = 0; i < pages.length; ++i) {
    const page = pages[i];
    const image = images[i];
    // mix of text and vision
    const res = await ctx.runPrompt(
      async (_) => {
        // Only the last two pages are replayed as context.
        const previousPages = markdowns.slice(-2).join("\n\n");
        if (previousPages.length) _.def("PREVIOUS_PAGES", previousPages);
        if (page) _.def("PAGE", page);
        if (image) _.defImages(image, { autoCrop: true, greyscale: true });
        _.$`You are an expert at converting PDFs to markdown.

        ## Task
        Your task is to analyze the image and extract textual content in markdown format.

        The image is a screenshot of the current page in the PDF document.
        We used pdfjs-dist to extract the text of the current page in <PAGE>, use it to help with the conversion.
        The text from the previous pages is in <PREVIOUS_PAGES>, use it to ensure consistency in the conversion.

        ## Instructions
        - Ensure markdown text formatting for the extracted text is applied properly by analyzing the image.
        - Do not change any content in the original extracted text while applying markdown formatting and do not repeat the extracted text.
        - Preserve markdown text formatting if present such as horizontal lines, header levels, footers, bullet points, links/urls, or other markdown elements.
        - Extract source code snippets in code fences.
        - Do not omit any textual content from the markdown formatted extracted text.
        - Do not generate page breaks
        - Do not repeat the <PREVIOUS_PAGES> content.
        - Do not include any additional explanations or comments in the markdown formatted extracted text.
        `;
        // Must be written to the inline prompt (`_`), not the global prompt
        // context, otherwise the instruction never reaches this page's prompt.
        if (image) _.$`- For images, generate a short alt-text description.`;
        if (typeof instructions === "string") _.$`${instructions}`;
        else if (typeof instructions === "function") await instructions(_);
      },
      {
        ...rest,
        model,
        label: `${label}: page ${i + 1}`,
        responseType,
        system: ["system", "system.assistant"],
      },
    );
    if (res.error) throw new Error(res.error.message);
    markdowns.push(res.text);
  }

  return { pages, images, markdowns };
}
0 commit comments