Skip to content

Commit 2711e16

Browse files
authored
Merge pull request #6 from managedcode/copilot/fix-5
Add support for PDF, DOCX, XLSX, PPTX, and Image formats with OCR
2 parents 57005dc + 47f02f3 commit 2711e16

13 files changed

Lines changed: 1536 additions & 24 deletions

README.md

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ A modern C# .NET library for converting various document formats (HTML, PDF, DOC
2121
|--------|-----------|---------|-------------|
2222
| **HTML** | `.html`, `.htm` | ✅ Supported | Full HTML to Markdown conversion |
2323
| **Plain Text** | `.txt`, `.md`, `.json` | ✅ Supported | Direct text processing |
24-
| **PDF** | `.pdf` | 🚧 Planned | Adobe PDF documents |
25-
| **Word** | `.docx` | 🚧 Planned | Microsoft Word documents |
26-
| **Excel** | `.xlsx` | 🚧 Planned | Microsoft Excel spreadsheets |
27-
| **PowerPoint** | `.pptx` | 🚧 Planned | Microsoft PowerPoint presentations |
28-
| **Images** | `.jpg`, `.png`, `.gif` | 🚧 Planned | OCR-based text extraction |
24+
| **PDF** | `.pdf` | ✅ Supported | Adobe PDF documents with text extraction |
25+
| **Word** | `.docx` | ✅ Supported | Microsoft Word documents with formatting |
26+
| **Excel** | `.xlsx` | ✅ Supported | Microsoft Excel spreadsheets as tables |
27+
| **PowerPoint** | `.pptx` | ✅ Supported | Microsoft PowerPoint presentations |
28+
| **Images** | `.jpg`, `.png`, `.gif`, `.bmp`, `.tiff`, `.webp` | ✅ Supported | OCR-based text extraction |
2929

3030
### HTML Conversion Features
3131
- Headers (H1-H6) → Markdown headers
@@ -38,6 +38,23 @@ A modern C# .NET library for converting various document formats (HTML, PDF, DOC
3838
- Code blocks and inline code
3939
- Blockquotes
4040

41+
### PDF Conversion Features
42+
- Text extraction with page separation
43+
- Header detection based on formatting
44+
- List item recognition
45+
- Title extraction from document content
46+
47+
### Office Documents (DOCX/XLSX/PPTX)
48+
- **Word (.docx)**: Headers, paragraphs, tables, bold/italic formatting
49+
- **Excel (.xlsx)**: Spreadsheet data as Markdown tables with sheet organization
50+
- **PowerPoint (.pptx)**: Slide-by-slide content with title recognition
51+
52+
### Image OCR Features
53+
- Support for multiple formats: JPEG, PNG, GIF, BMP, TIFF, WebP
54+
- Text extraction using Tesseract OCR
55+
- Header detection and paragraph formatting
56+
- Graceful fallback when OCR fails
57+
4158
## 🚀 Quick Start
4259

4360
### Installation
@@ -59,6 +76,15 @@ dotnet add package MarkItDown
5976
- .NET 8.0 SDK or later
6077
- Compatible with .NET 8.0+ projects (ready for .NET 9)
6178

79+
### Optional Dependencies for Advanced Features
80+
- **PDF Support**: Included via iText7 (automatically installed)
81+
- **Office Documents**: Included via DocumentFormat.OpenXml (automatically installed)
82+
- **Image OCR**: Requires Tesseract OCR data files
83+
  - Install Tesseract: `apt-get install tesseract-ocr` (Debian/Ubuntu Linux) or `brew install tesseract` (macOS)
84+
  - Set the `TESSDATA_PREFIX` environment variable to the Tesseract data directory if needed
85+
86+
> **Note**: All dependencies except Tesseract OCR data are automatically managed via NuGet packages.
87+
6288
## 💻 Usage
6389

6490
### Basic API Usage

src/MarkItDown.Cli/MarkItDown.Cli.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717

1818
<ItemGroup>
1919
<PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
20-
<PackageReference Include="Microsoft.Extensions.Logging" Version="8.0.0" />
21-
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="8.0.0" />
20+
<PackageReference Include="Microsoft.Extensions.Logging" Version="8.0.1" />
21+
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="8.0.1" />
2222
</ItemGroup>
2323

2424
</Project>
Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
using DocumentFormat.OpenXml.Packaging;
2+
using DocumentFormat.OpenXml.Wordprocessing;
3+
using System.Text;
4+
5+
namespace MarkItDown.Core.Converters;
6+
7+
/// <summary>
8+
/// Converter for Microsoft Word (.docx) files to Markdown using DocumentFormat.OpenXml.
9+
/// </summary>
10+
public sealed class DocxConverter : IDocumentConverter
11+
{
12+
/// <summary>
/// File extensions this converter recognises. Lookups ignore case.
/// </summary>
private static readonly HashSet<string> AcceptedExtensions =
    new HashSet<string>(StringComparer.OrdinalIgnoreCase) { ".docx" };

/// <summary>
/// MIME types this converter recognises. Lookups ignore case.
/// </summary>
private static readonly HashSet<string> AcceptedMimeTypes =
    new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    };

/// <summary>
/// Fixed priority value reported by this converter.
/// </summary>
/// <remarks>
/// NOTE(review): the original author's comment says this value sits "between PDF and plain text";
/// how priorities are ordered is decided by code outside this file, so confirm against the registry.
/// </remarks>
public int Priority => 210;
23+
24+
/// <summary>
/// Checks the stream metadata alone (no content inspection) against the accepted
/// extensions and MIME types.
/// </summary>
/// <param name="streamInfo">Metadata whose <c>Extension</c> and <c>MimeType</c> are examined; either may be null.</param>
/// <returns>
/// <c>true</c> when the extension is one of <c>AcceptedExtensions</c> or the MIME type is one of
/// <c>AcceptedMimeTypes</c>; otherwise <c>false</c>.
/// </returns>
public bool AcceptsInput(StreamInfo streamInfo)
{
    // A missing MIME type is looked up as the empty string, which is never in the accepted set.
    var normalizedMime = streamInfo.MimeType?.ToLowerInvariant() ?? string.Empty;
    var normalizedExtension = streamInfo.Extension?.ToLowerInvariant();

    var extensionMatches = normalizedExtension is not null
        && AcceptedExtensions.Contains(normalizedExtension);

    return extensionMatches || AcceptedMimeTypes.Contains(normalizedMime);
}
39+
40+
/// <summary>
/// Decides whether this converter should handle the given stream, using the metadata and,
/// when possible, the first four bytes of the content.
/// </summary>
/// <param name="stream">
/// Document content. It is only inspected when it is seekable and longer than four bytes;
/// its position is put back to where it was before this method returns.
/// </param>
/// <param name="streamInfo">Metadata passed to <see cref="AcceptsInput"/>.</param>
/// <param name="cancellationToken">Not consulted by this check.</param>
/// <returns>
/// <c>false</c> when the metadata does not match, or when four header bytes were read and they
/// are not one of the ZIP record signatures; <c>true</c> in every other case, including when the
/// header could not be inspected.
/// </returns>
public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
{
    if (!AcceptsInput(streamInfo))
        return false;

    // The header can only be sniffed without consuming data on a seekable stream.
    if (!stream.CanSeek || stream.Length <= 4)
        return true;

    var header = new byte[4];
    var filled = 0;
    var originalPosition = stream.Position;

    try
    {
        stream.Position = 0;

        // Stream.Read may legally return fewer bytes than requested, so keep reading
        // until the header is complete or the stream reports end-of-data.
        while (filled < header.Length)
        {
            var read = stream.Read(header, filled, header.Length - filled);
            if (read == 0)
                break;

            filled += read;
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: if the header cannot be read, fall back to the
        // metadata-only decision instead of failing the acceptance check.
        return true;
    }
    finally
    {
        // Single restore point for both the success and the failure path.
        stream.Position = originalPosition;
    }

    if (filled < header.Length)
        return true;

    // DOCX files are ZIP archives. Only these byte pairs follow "PK" in a ZIP signature:
    // 03 04 (local file header), 05 06 (end of central directory), 07 08 (data descriptor / spanned marker).
    // Bytes 3 and 4 are checked as a pair so that mixed combinations such as 03 06 are rejected.
    var third = header[2];
    var fourth = header[3];
    var knownRecord = (third == 0x03 && fourth == 0x04)
        || (third == 0x05 && fourth == 0x06)
        || (third == 0x07 && fourth == 0x08);

    return header[0] == 0x50 && header[1] == 0x4B && knownRecord;
}
72+
73+
/// <summary>
/// Converts a DOCX stream to Markdown and derives a title from the generated text.
/// </summary>
/// <param name="stream">Document content; rewound to the start first when it is seekable.</param>
/// <param name="streamInfo">Stream metadata. Not read by this method.</param>
/// <param name="cancellationToken">Token forwarded to the extraction step.</param>
/// <returns>A result holding the Markdown text and the title (which may be null).</returns>
/// <exception cref="FileConversionException">
/// Thrown for any failure other than a <c>MarkItDownException</c> or a cancellation; the original
/// exception is attached as the inner exception.
/// </exception>
/// <exception cref="OperationCanceledException">
/// Propagated unchanged when the operation is cancelled, so callers can tell cancellation apart
/// from a genuine conversion failure.
/// </exception>
public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
{
    try
    {
        // Start from the beginning of the content when the stream allows it.
        if (stream.CanSeek)
        {
            stream.Position = 0;
        }

        var markdown = await ExtractTextFromDocxAsync(stream, cancellationToken).ConfigureAwait(false);
        var title = ExtractTitle(markdown);

        return new DocumentConverterResult(markdown, title);
    }
    catch (Exception ex) when (ex is not MarkItDownException and not OperationCanceledException)
    {
        // Cancellation is excluded from the filter: wrapping it would report a cancelled
        // request as a broken document.
        throw new FileConversionException($"Failed to convert DOCX file: {ex.Message}", ex);
    }
}
91+
92+
/// <summary>
/// Opens the stream as a non-editable Word document on a <c>Task.Run</c> work item and renders
/// the document body to Markdown.
/// </summary>
/// <param name="stream">Stream containing the DOCX package.</param>
/// <param name="cancellationToken">Passed to <c>Task.Run</c> and to the body traversal.</param>
/// <returns>The generated Markdown with surrounding whitespace trimmed; empty when the document has no body.</returns>
private static async Task<string> ExtractTextFromDocxAsync(Stream stream, CancellationToken cancellationToken)
{
    var rendered = await Task.Run(() =>
    {
        var buffer = new StringBuilder();

        using var document = WordprocessingDocument.Open(stream, false);

        // Any link in the chain may be missing; in that case nothing is rendered.
        if (document.MainDocumentPart?.Document?.Body is { } documentBody)
        {
            ProcessBodyElements(documentBody, buffer, cancellationToken);
        }

        return buffer.ToString();
    }, cancellationToken);

    return rendered.Trim();
}
109+
110+
/// <summary>
/// Walks the direct children of the document body in order and renders paragraphs and tables.
/// Every other element type is skipped.
/// </summary>
/// <param name="body">Document body whose direct children are visited.</param>
/// <param name="result">Builder receiving the Markdown output.</param>
/// <param name="cancellationToken">Checked once per child element; throws when cancellation is requested.</param>
private static void ProcessBodyElements(Body body, StringBuilder result, CancellationToken cancellationToken)
{
    foreach (var child in body.Elements())
    {
        cancellationToken.ThrowIfCancellationRequested();

        if (child is Paragraph paragraph)
        {
            ProcessParagraph(paragraph, result);
        }
        else if (child is Table table)
        {
            ProcessTable(table, result);
        }
        // Other element types are intentionally ignored for now.
    }
}
128+
129+
/// <summary>
/// Renders one Word paragraph as a Markdown heading or plain paragraph, followed by a blank line.
/// </summary>
/// <remarks>
/// <para>A paragraph whose style id starts with "heading" (any casing) and ends in a positive number N
/// is written as a level-N heading, capped at level 6. Bold/italic markers are not emitted inside
/// paragraphs that carry a heading style.</para>
/// <para>Text is collected from runs that are direct children of the paragraph and from runs directly
/// inside a hyperlink, in document order. Only the link text is emitted, not the target.</para>
/// <para>Paragraphs that contain only whitespace produce no output.</para>
/// </remarks>
/// <param name="paragraph">Paragraph to render.</param>
/// <param name="result">Builder receiving the Markdown output.</param>
private static void ProcessParagraph(Paragraph paragraph, StringBuilder result)
{
    const string headingPrefix = "heading";

    var isHeading = false;
    var headingLevel = 0;

    // Style ids are identifiers, so compare them ordinally rather than with culture rules.
    var styleId = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
    if (styleId is not null && styleId.StartsWith(headingPrefix, StringComparison.OrdinalIgnoreCase))
    {
        isHeading = true;
        if (int.TryParse(styleId.Substring(headingPrefix.Length), out var parsedLevel))
        {
            headingLevel = parsedLevel;
        }
    }

    var paragraphText = new StringBuilder();

    foreach (var run in EnumerateRuns(paragraph))
    {
        var runProperties = run.RunProperties;
        var bold = !isHeading && IsEnabled(runProperties?.Bold);
        var italic = !isHeading && IsEnabled(runProperties?.Italic);

        foreach (var child in run.Elements())
        {
            switch (child)
            {
                case Text text:
                    paragraphText.Append(Emphasize(text.Text, bold, italic));
                    break;
                case TabChar:
                    paragraphText.Append('\t');
                    break;
                case Break:
                    paragraphText.AppendLine();
                    break;
            }
        }
    }

    var finalText = paragraphText.ToString();
    if (string.IsNullOrWhiteSpace(finalText))
    {
        return;
    }

    if (isHeading && headingLevel > 0)
    {
        result.Append('#', Math.Min(headingLevel, 6)).Append(' ');
    }

    result.AppendLine(finalText.Trim());
    result.AppendLine();

    // Yields direct-child runs plus runs directly inside hyperlinks, preserving document order.
    static IEnumerable<Run> EnumerateRuns(Paragraph source)
    {
        foreach (var child in source.Elements())
        {
            if (child is Run directRun)
            {
                yield return directRun;
            }
            else if (child is Hyperlink hyperlink)
            {
                foreach (var linkedRun in hyperlink.Elements<Run>())
                {
                    yield return linkedRun;
                }
            }
        }
    }

    // An on/off property is active when the element is present and its value is absent or true;
    // an explicit false value (e.g. <w:b w:val="0"/>) switches the formatting off.
    static bool IsEnabled(OnOffType? toggle) =>
        toggle is not null && (toggle.Val?.Value ?? true);

    // Wraps the non-whitespace core of the text in emphasis markers. Surrounding whitespace stays
    // outside the markers and whitespace-only text is returned untouched, because "** text**"
    // and "** **" are not parsed as emphasis.
    static string Emphasize(string text, bool bold, bool italic)
    {
        var marker = bold
            ? (italic ? "***" : "**")
            : (italic ? "*" : string.Empty);

        if (marker.Length == 0)
        {
            return text;
        }

        var core = text.Trim();
        if (core.Length == 0)
        {
            return text;
        }

        var leadingLength = text.Length - text.TrimStart().Length;
        var leading = text.Substring(0, leadingLength);
        var trailing = text.Substring(leadingLength + core.Length);

        return leading + marker + core + marker + trailing;
    }
}
200+
201+
/// <summary>
/// Renders a Word table as a Markdown pipe table, treating the first non-empty row as the header.
/// </summary>
/// <remarks>
/// Rows without cells are skipped. Every emitted row, and the header separator, is padded with
/// empty cells to the width of the widest row. Sizing the table from the first row alone would
/// make Markdown renderers drop cells from any later row that is wider than the header.
/// Pipe characters inside cell text are escaped.
/// NOTE(review): a horizontally merged cell is still counted as a single cell here.
/// </remarks>
/// <param name="table">Table to render.</param>
/// <param name="result">Builder receiving the Markdown output.</param>
private static void ProcessTable(Table table, StringBuilder result)
{
    var tableRows = table.Elements<TableRow>().ToList();
    if (tableRows.Count == 0)
        return;

    // Extract and escape all cell text first so the final column count is known before writing.
    var renderedRows = tableRows
        .Select(row => row.Elements<TableCell>()
            .Select(cell => ExtractCellText(cell).Replace("|", "\\|").Trim())
            .ToList())
        .Where(cells => cells.Count > 0)
        .ToList();

    var columnCount = renderedRows.Count == 0 ? 0 : renderedRows.Max(cells => cells.Count);

    result.AppendLine();

    for (var rowIndex = 0; rowIndex < renderedRows.Count; rowIndex++)
    {
        var cells = renderedRows[rowIndex];

        result.Append('|');
        for (var column = 0; column < columnCount; column++)
        {
            var cellText = column < cells.Count ? cells[column] : string.Empty;
            result.Append(' ').Append(cellText).Append(" |");
        }
        result.AppendLine();

        // The separator line goes directly under the header row.
        if (rowIndex == 0)
        {
            result.Append('|');
            for (var column = 0; column < columnCount; column++)
            {
                result.Append(" --- |");
            }
            result.AppendLine();
        }
    }

    result.AppendLine();
}
239+
240+
/// <summary>
/// Flattens the text of a table cell into a single line.
/// </summary>
/// <remarks>
/// Text is taken from runs that are direct children of each paragraph and from runs directly inside
/// a hyperlink, in document order, so link text in a cell is not lost. Paragraphs are joined with
/// a single space. Tabs, breaks and formatting are not represented.
/// </remarks>
/// <param name="cell">Cell whose direct-child paragraphs are read.</param>
/// <returns>The concatenated cell text with surrounding whitespace trimmed.</returns>
private static string ExtractCellText(TableCell cell)
{
    var cellText = new StringBuilder();

    foreach (var paragraph in cell.Elements<Paragraph>())
    {
        foreach (var child in paragraph.Elements())
        {
            if (child is Run run)
            {
                AppendRunText(run, cellText);
            }
            else if (child is Hyperlink hyperlink)
            {
                foreach (var linkedRun in hyperlink.Elements<Run>())
                {
                    AppendRunText(linkedRun, cellText);
                }
            }
        }

        // Separate paragraphs with a space; the final Trim removes the trailing one.
        if (cellText.Length > 0)
            cellText.Append(' ');
    }

    return cellText.ToString().Trim();

    // Appends the text elements that are direct children of the run.
    static void AppendRunText(Run source, StringBuilder target)
    {
        foreach (var text in source.Elements<Text>())
        {
            target.Append(text.Text);
        }
    }
}
260+
261+
/// <summary>
/// Picks a title from generated Markdown.
/// </summary>
/// <remarks>
/// Only non-empty lines are considered. The first of the leading ten lines that starts with '#'
/// wins, with its '#' characters and surrounding whitespace removed. Failing that, the first of
/// the leading five lines whose trimmed length is between 6 and 99 characters is used.
/// </remarks>
/// <param name="markdown">Markdown text to inspect.</param>
/// <returns>The chosen title, or <c>null</c> when the text is blank or no line qualifies.</returns>
private static string? ExtractTitle(string markdown)
{
    if (string.IsNullOrWhiteSpace(markdown))
    {
        return null;
    }

    // Trim once up front; both passes look at the same leading lines.
    var candidates = markdown
        .Split('\n', StringSplitOptions.RemoveEmptyEntries)
        .Take(10)
        .Select(candidate => candidate.Trim())
        .ToArray();

    var heading = candidates.FirstOrDefault(candidate => candidate.StartsWith('#'));
    if (heading is not null)
    {
        return heading.TrimStart('#').Trim();
    }

    // No heading: fall back to the first reasonably sized line near the top.
    return candidates
        .Take(5)
        .FirstOrDefault(candidate => candidate.Length > 5 && candidate.Length < 100);
}
290+
}

src/MarkItDown.Core/Converters/HtmlConverter.cs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ private static void ConvertElementToMarkdown(HtmlNode element, StringBuilder mar
276276
case "blockquote":
277277
markdown.AppendLine();
278278
markdown.Append("> ");
279-
ConvertChildrenToMarkdown(element, markdown, indentLevel);
279+
ConvertBlockquoteChildren(element, markdown, indentLevel);
280280
markdown.AppendLine();
281281
markdown.AppendLine();
282282
break;
@@ -318,6 +318,28 @@ private static void ConvertChildrenToMarkdown(HtmlNode element, StringBuilder ma
318318
}
319319
}
320320

321+
/// <summary>
/// Renders the children of a blockquote element.
/// </summary>
/// <remarks>
/// A <c>p</c> child has its own children rendered directly via <c>ConvertChildrenToMarkdown</c>,
/// rather than going through <c>ConvertNodeToMarkdown</c>. A line break is inserted before every
/// paragraph except the first. All other nodes are handed to <c>ConvertNodeToMarkdown</c>.
/// </remarks>
/// <param name="element">Blockquote node; nothing is written when it or its child list is null.</param>
/// <param name="markdown">Builder receiving the Markdown output.</param>
/// <param name="indentLevel">Indent level forwarded unchanged to the nested conversion calls.</param>
private static void ConvertBlockquoteChildren(HtmlNode element, StringBuilder markdown, int indentLevel)
{
    var children = element?.ChildNodes;
    if (children == null)
    {
        return;
    }

    var paragraphSeen = false;
    foreach (var node in children)
    {
        var isParagraph = node.NodeType == HtmlNodeType.Element
            && node.Name.ToLowerInvariant() == "p";

        if (!isParagraph)
        {
            ConvertNodeToMarkdown(node, markdown, indentLevel);
            continue;
        }

        // Only paragraphs after the first get a separating line break.
        if (paragraphSeen)
        {
            markdown.AppendLine();
        }

        ConvertChildrenToMarkdown(node, markdown, indentLevel);
        paragraphSeen = true;
    }
}
342+
321343
private static void ConvertTableToMarkdown(HtmlNode table, StringBuilder markdown)
322344
{
323345
var rows = table.SelectNodes(".//tr");

0 commit comments

Comments
 (0)