managedcode
diff --git a/‎README.md‎
Lines changed: 31 additions & 5 deletions b/‎README.md‎
Lines changed: 31 additions & 5 deletions
diff --git a/‎src/MarkItDown.Cli/MarkItDown.Cli.csproj‎
Lines changed: 2 additions & 2 deletions b/‎src/MarkItDown.Cli/MarkItDown.Cli.csproj‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/MarkItDown.Core/Converters/DocxConverter.cs‎
Lines changed: 290 additions & 0 deletions b/‎src/MarkItDown.Core/Converters/DocxConverter.cs‎
Lines changed: 290 additions & 0 deletions
diff --git a/‎src/MarkItDown.Core/Converters/HtmlConverter.cs‎
Lines changed: 23 additions & 1 deletion b/‎src/MarkItDown.Core/Converters/HtmlConverter.cs‎
Lines changed: 23 additions & 1 deletion
@@ -21,11 +21,11 @@ A modern C# .NET library for converting various document formats (HTML, PDF, DOC
 |--------|-----------|---------|-------------|
 | **HTML** | `.html`, `.htm` | ✅ Supported | Full HTML to Markdown conversion |
 | **Plain Text** | `.txt`, `.md`, `.json` | ✅ Supported | Direct text processing |
-| **PDF** | `.pdf` | 🚧 Planned | Adobe PDF documents |
-| **Word** | `.docx` | 🚧 Planned | Microsoft Word documents |
-| **Excel** | `.xlsx` | 🚧 Planned | Microsoft Excel spreadsheets |
-| **PowerPoint** | `.pptx` | 🚧 Planned | Microsoft PowerPoint presentations |
-| **Images** | `.jpg`, `.png`, `.gif` | 🚧 Planned | OCR-based text extraction |
+| **PDF** | `.pdf` | ✅ Supported | Adobe PDF documents with text extraction |
+| **Word** | `.docx` | ✅ Supported | Microsoft Word documents with formatting |
+| **Excel** | `.xlsx` | ✅ Supported | Microsoft Excel spreadsheets as tables |
+| **PowerPoint** | `.pptx` | ✅ Supported | Microsoft PowerPoint presentations |
+| **Images** | `.jpg`, `.png`, `.gif`, `.bmp`, `.tiff`, `.webp` | ✅ Supported | OCR-based text extraction |
 
 ### HTML Conversion Features
 - Headers (H1-H6) → Markdown headers
@@ -38,6 +38,23 @@ A modern C# .NET library for converting various document formats (HTML, PDF, DOC
 - Code blocks and inline code
 - Blockquotes
 
+### PDF Conversion Features
+- Text extraction with page separation
+- Header detection based on formatting
+- List item recognition
+- Title extraction from document content
+
+### Office Documents (DOCX/XLSX/PPTX)
+- **Word (.docx)**: Headers, paragraphs, tables, bold/italic formatting
+- **Excel (.xlsx)**: Spreadsheet data as Markdown tables with sheet organization
+- **PowerPoint (.pptx)**: Slide-by-slide content with title recognition
+
+### Image OCR Features
+- Support for multiple formats: JPEG, PNG, GIF, BMP, TIFF, WebP
+- Text extraction using Tesseract OCR
+- Header detection and paragraph formatting
+- Graceful fallback when OCR fails
+
 ## 🚀 Quick Start
 
 ### Installation
@@ -59,6 +76,15 @@ dotnet add package MarkItDown
 - .NET 8.0 SDK or later
 - Compatible with .NET 8.0+ projects (ready for .NET 9)
 
+### Optional Dependencies for Advanced Features
+- **PDF Support**: Included via iText7 (automatically installed)
+- **Office Documents**: Included via DocumentFormat.OpenXml (automatically installed)
+- **Image OCR**: Requires Tesseract OCR data files
+  - Install Tesseract: `apt-get install tesseract-ocr` (Linux) or `brew install tesseract` (macOS)
+  - Set the `TESSDATA_PREFIX` environment variable to the Tesseract data directory if needed
+
+> **Note**: All dependencies except Tesseract OCR data are automatically managed via NuGet packages.
+
 ## 💻 Usage
 
 ### Basic API Usage
 
@@ -17,8 +17,8 @@
 
   <ItemGroup>
     <PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
-    <PackageReference Include="Microsoft.Extensions.Logging" Version="8.0.0" />
-    <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="8.0.0" />
+    <PackageReference Include="Microsoft.Extensions.Logging" Version="8.0.1" />
+    <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="8.0.1" />
   </ItemGroup>
 
 </Project>
@@ -0,0 +1,290 @@
+using DocumentFormat.OpenXml.Packaging;
+using DocumentFormat.OpenXml.Wordprocessing;
+using System.Text;
+
+namespace MarkItDown.Core.Converters;
+
+/// <summary>
+/// Converter for Microsoft Word (.docx) files to Markdown using DocumentFormat.OpenXml.
+/// </summary>
+public sealed class DocxConverter : IDocumentConverter
+{
+    // File extensions this converter recognizes. The set uses an ordinal,
+    // case-insensitive comparer, so lookups do not depend on letter case.
+    private static readonly HashSet<string> AcceptedExtensions = new(StringComparer.OrdinalIgnoreCase)
+    {
+        ".docx"
+    };
+
+    // MIME types this converter recognizes. The set uses an ordinal,
+    // case-insensitive comparer, so lookups do not depend on letter case.
+    private static readonly HashSet<string> AcceptedMimeTypes = new(StringComparer.OrdinalIgnoreCase)
+    {
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    };
+
+    /// <summary>
+    /// Priority value reported for this converter.
+    /// </summary>
+    // NOTE(review): how this number is ordered against other converters is decided
+    // by the code that consumes IDocumentConverter, which is not visible here — confirm.
+    public int Priority => 210; // Between PDF and plain text
+
+    /// <summary>
+    /// Decides from stream metadata alone whether this converter applies.
+    /// </summary>
+    /// <param name="streamInfo">Metadata whose extension and MIME type are inspected.</param>
+    /// <returns>
+    /// <c>true</c> when the extension is in <c>AcceptedExtensions</c> or the MIME type is in
+    /// <c>AcceptedMimeTypes</c>; otherwise <c>false</c>.
+    /// </returns>
+    public bool AcceptsInput(StreamInfo streamInfo)
+    {
+        // Both sets compare with OrdinalIgnoreCase, so the values are looked up as-is.
+        var hasKnownMimeType = streamInfo.MimeType is { } mimeType && AcceptedMimeTypes.Contains(mimeType);
+        var hasKnownExtension = streamInfo.Extension is { } extension && AcceptedExtensions.Contains(extension);
+
+        return hasKnownExtension || hasKnownMimeType;
+    }
+
+    /// <summary>
+    /// Decides whether this converter should handle the stream, using the metadata check from
+    /// <see cref="AcceptsInput"/> and, for seekable streams longer than four bytes, the ZIP
+    /// signature at the start of the stream.
+    /// </summary>
+    /// <param name="stream">Content to probe. Its position is restored before returning.</param>
+    /// <param name="streamInfo">Metadata describing the stream.</param>
+    /// <param name="cancellationToken">Not used by this method.</param>
+    /// <returns>
+    /// <c>false</c> when the metadata does not match or the first four bytes are not a ZIP
+    /// signature; <c>true</c> otherwise, including when the header cannot be inspected.
+    /// </returns>
+    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
+    {
+        if (!AcceptsInput(streamInfo))
+            return false;
+
+        // The header can only be peeked without consuming it on a seekable stream;
+        // otherwise the metadata match is all there is to go on.
+        if (!stream.CanSeek || stream.Length <= 4)
+            return true;
+
+        var originalPosition = stream.Position;
+        try
+        {
+            stream.Position = 0;
+
+            // A single Read call may legally return fewer bytes than requested,
+            // so keep reading until four bytes arrive or the stream ends.
+            Span<byte> header = stackalloc byte[4];
+            var bytesRead = stream.ReadAtLeast(header, header.Length, throwOnEndOfStream: false);
+            if (bytesRead < header.Length)
+                return true;
+
+            // DOCX files are ZIP archives. Bytes 3 and 4 must match as a pair:
+            // PK\x03\x04 (local file header), PK\x05\x06 (end of central directory)
+            // or PK\x07\x08 (spanned archive marker).
+            var isLocalFileHeader = header[2] == 0x03 && header[3] == 0x04;
+            var isEndOfCentralDirectory = header[2] == 0x05 && header[3] == 0x06;
+            var isSpannedMarker = header[2] == 0x07 && header[3] == 0x08;
+
+            return header[0] == 0x50 && header[1] == 0x4B &&
+                   (isLocalFileHeader || isEndOfCentralDirectory || isSpannedMarker);
+        }
+        catch (Exception)
+        {
+            // Deliberate best effort: if the header cannot be read, fall back to the
+            // metadata match and let conversion report any real problem.
+            return true;
+        }
+        finally
+        {
+            // Leave the stream where the caller had it, on success and on failure.
+            stream.Position = originalPosition;
+        }
+    }
+
+    /// <summary>
+    /// Converts a DOCX stream to Markdown and derives a title from the generated text.
+    /// </summary>
+    /// <param name="stream">DOCX content. It is rewound to the start when it is seekable.</param>
+    /// <param name="streamInfo">Metadata describing the stream; not read by this method.</param>
+    /// <param name="cancellationToken">Token used to cancel the extraction.</param>
+    /// <returns>The Markdown text together with the extracted title, which may be null.</returns>
+    /// <exception cref="FileConversionException">
+    /// Thrown when extraction fails with anything other than a <c>MarkItDownException</c>
+    /// or a cancellation.
+    /// </exception>
+    /// <exception cref="OperationCanceledException">Thrown when the operation is cancelled.</exception>
+    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
+    {
+        try
+        {
+            // Reset stream position
+            if (stream.CanSeek)
+                stream.Position = 0;
+
+            var markdown = await ExtractTextFromDocxAsync(stream, cancellationToken).ConfigureAwait(false);
+            var title = ExtractTitle(markdown);
+
+            return new DocumentConverterResult(markdown, title);
+        }
+        // Cancellation must reach the caller as OperationCanceledException; wrapping it
+        // would make a cancelled request look like a corrupt document.
+        catch (Exception ex) when (ex is not MarkItDownException and not OperationCanceledException)
+        {
+            throw new FileConversionException($"Failed to convert DOCX file: {ex.Message}", ex);
+        }
+    }
+
+    /// <summary>
+    /// Opens the stream as a read-only Word document on a background task and renders the
+    /// body elements to Markdown.
+    /// </summary>
+    /// <param name="stream">Stream containing the DOCX package.</param>
+    /// <param name="cancellationToken">Token passed to the task and to the body walk.</param>
+    /// <returns>The rendered Markdown, trimmed; empty when the document has no body.</returns>
+    private static async Task<string> ExtractTextFromDocxAsync(Stream stream, CancellationToken cancellationToken)
+    {
+        var rendered = await Task.Run(() =>
+        {
+            var buffer = new StringBuilder();
+
+            using (var document = WordprocessingDocument.Open(stream, false))
+            {
+                if (document.MainDocumentPart?.Document?.Body is { } documentBody)
+                {
+                    ProcessBodyElements(documentBody, buffer, cancellationToken);
+                }
+            }
+
+            return buffer.ToString();
+        }, cancellationToken);
+
+        return rendered.Trim();
+    }
+
+    /// <summary>
+    /// Walks the direct children of the document body and renders paragraphs and tables;
+    /// every other element type is ignored.
+    /// </summary>
+    /// <param name="body">Document body whose children are rendered.</param>
+    /// <param name="result">Builder that receives the Markdown.</param>
+    /// <param name="cancellationToken">Checked before each child element is processed.</param>
+    private static void ProcessBodyElements(Body body, StringBuilder result, CancellationToken cancellationToken)
+    {
+        foreach (var child in body.Elements())
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            if (child is Paragraph paragraphElement)
+            {
+                ProcessParagraph(paragraphElement, result);
+            }
+            else if (child is Table tableElement)
+            {
+                ProcessTable(tableElement, result);
+            }
+            // Add more element types as needed
+        }
+    }
+
+    /// <summary>
+    /// Appends one paragraph to <paramref name="result"/> as Markdown. A paragraph whose style
+    /// id is "heading" followed by a positive number becomes a Markdown heading (capped at
+    /// level 6); any other paragraph becomes plain text with bold/italic emphasis applied per
+    /// run. Paragraphs that contain only whitespace are skipped.
+    /// </summary>
+    /// <param name="paragraph">Paragraph to render. Only its direct child runs are read.</param>
+    /// <param name="result">Builder that receives the text followed by a blank line.</param>
+    private static void ProcessParagraph(Paragraph paragraph, StringBuilder result)
+    {
+        var headingLevel = GetHeadingLevel(paragraph);
+        var isHeading = headingLevel > 0;
+        var paragraphText = new StringBuilder();
+
+        // Process runs within the paragraph
+        foreach (var run in paragraph.Elements<Run>())
+        {
+            var runProperties = run.RunProperties;
+
+            // Headings carry their own emphasis, so run formatting is not applied to them.
+            var currentBold = !isHeading && IsToggleOn(runProperties?.Bold);
+            var currentItalic = !isHeading && IsToggleOn(runProperties?.Italic);
+
+            foreach (var runChild in run.Elements())
+            {
+                switch (runChild)
+                {
+                    case Text text:
+                        paragraphText.Append(ApplyEmphasis(text.Text, currentBold, currentItalic));
+                        break;
+                    case TabChar:
+                        paragraphText.Append('\t');
+                        break;
+                    case Break:
+                        paragraphText.AppendLine();
+                        break;
+                }
+            }
+        }
+
+        var finalText = paragraphText.ToString();
+        if (string.IsNullOrWhiteSpace(finalText))
+            return;
+
+        if (isHeading)
+        {
+            result.Append('#', Math.Min(headingLevel, 6));
+            result.Append(' ');
+        }
+
+        result.AppendLine(finalText.Trim());
+        result.AppendLine();
+    }
+
+    /// <summary>
+    /// Returns the heading level encoded in the paragraph style id ("Heading1" gives 1), or 0
+    /// when the paragraph has no style, the style is not a heading, or no positive level
+    /// follows the "heading" prefix.
+    /// </summary>
+    private static int GetHeadingLevel(Paragraph paragraph)
+    {
+        const string headingPrefix = "heading";
+
+        var styleId = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
+        if (styleId is null || !styleId.StartsWith(headingPrefix, StringComparison.OrdinalIgnoreCase))
+            return 0;
+
+        // Style ids are machine-readable identifiers, so parse them culture-independently.
+        var parsed = int.TryParse(
+            styleId[headingPrefix.Length..],
+            System.Globalization.NumberStyles.Integer,
+            System.Globalization.CultureInfo.InvariantCulture,
+            out var level);
+
+        return parsed && level > 0 ? level : 0;
+    }
+
+    /// <summary>
+    /// Evaluates a WordprocessingML toggle element such as bold or italic. A present element
+    /// with no value means "on"; an explicit false value (for example w:val="0") means "off".
+    /// </summary>
+    private static bool IsToggleOn(OnOffType? toggle) =>
+        toggle is not null && (toggle.Val?.Value ?? true);
+
+    /// <summary>
+    /// Wraps <paramref name="text"/> in Markdown emphasis markers. Surrounding whitespace is
+    /// kept outside the markers and whitespace-only text is returned unchanged, because
+    /// markers adjacent to whitespace are not rendered as emphasis.
+    /// </summary>
+    private static string ApplyEmphasis(string text, bool bold, bool italic)
+    {
+        if ((!bold && !italic) || string.IsNullOrWhiteSpace(text))
+            return text;
+
+        var marker = bold && italic ? "***" : bold ? "**" : "*";
+
+        var core = text.Trim();
+        var leadingLength = text.Length - text.TrimStart().Length;
+        var leading = text[..leadingLength];
+        var trailing = text[(leadingLength + core.Length)..];
+
+        return $"{leading}{marker}{core}{marker}{trailing}";
+    }
+
+    /// <summary>
+    /// Renders a Word table as a Markdown pipe table. The first row that has cells is followed
+    /// by a separator line, rows without cells are skipped, and pipe characters in cell text
+    /// are escaped.
+    /// </summary>
+    /// <param name="table">Table to render. Only its direct child rows are read.</param>
+    /// <param name="result">Builder that receives the table, surrounded by blank lines.</param>
+    private static void ProcessTable(Table table, StringBuilder result)
+    {
+        var tableRows = table.Elements<TableRow>().ToList();
+        if (tableRows.Count == 0)
+        {
+            return;
+        }
+
+        result.AppendLine();
+
+        var separatorPending = true;
+        foreach (var tableRow in tableRows)
+        {
+            var rowCells = tableRow.Elements<TableCell>().ToList();
+            if (rowCells.Count == 0)
+            {
+                continue;
+            }
+
+            var renderedCells = rowCells.Select(
+                rowCell => " " + ExtractCellText(rowCell).Replace("|", "\\|").Trim() + " |");
+            result.Append('|').AppendLine(string.Concat(renderedCells));
+
+            // Add header separator after first row
+            if (separatorPending)
+            {
+                var separatorCells = Enumerable.Repeat(" --- |", rowCells.Count);
+                result.Append('|').AppendLine(string.Concat(separatorCells));
+                separatorPending = false;
+            }
+        }
+
+        result.AppendLine();
+    }
+
+    /// <summary>
+    /// Concatenates the text of every run in the cell's direct child paragraphs, placing a
+    /// single space after each paragraph once any text has been collected.
+    /// </summary>
+    /// <param name="cell">Table cell to read.</param>
+    /// <returns>The collected text with leading and trailing whitespace removed.</returns>
+    private static string ExtractCellText(TableCell cell)
+    {
+        var buffer = new StringBuilder();
+
+        foreach (var cellParagraph in cell.Elements<Paragraph>())
+        {
+            var pieces = cellParagraph.Elements<Run>()
+                .SelectMany(run => run.Elements<Text>())
+                .Select(textNode => textNode.Text);
+
+            buffer.Append(string.Concat(pieces));
+
+            // The space separates this paragraph from the next one.
+            if (buffer.Length > 0)
+            {
+                buffer.Append(' ');
+            }
+        }
+
+        return buffer.ToString().Trim();
+    }
+
+    /// <summary>
+    /// Picks a title from generated Markdown: the first line starting with '#' among the first
+    /// ten non-empty lines, or failing that the first of the first five lines whose trimmed
+    /// length is between 6 and 99 characters.
+    /// </summary>
+    /// <param name="markdown">Markdown text to inspect.</param>
+    /// <returns>The title text, or <c>null</c> when no suitable line exists.</returns>
+    private static string? ExtractTitle(string markdown)
+    {
+        if (string.IsNullOrWhiteSpace(markdown))
+        {
+            return null;
+        }
+
+        var candidates = markdown.Split('\n', StringSplitOptions.RemoveEmptyEntries);
+
+        // Look for the first heading
+        var heading = candidates
+            .Take(10)
+            .Select(candidate => candidate.Trim())
+            .FirstOrDefault(candidate => candidate.StartsWith('#'));
+        if (heading is not null)
+        {
+            return heading.TrimStart('#').Trim();
+        }
+
+        // If no heading found, use the first substantial line
+        return candidates
+            .Take(5)
+            .Select(candidate => candidate.Trim())
+            .FirstOrDefault(candidate => candidate.Length > 5 && candidate.Length < 100);
+    }
+}
@@ -276,7 +276,7 @@ private static void ConvertElementToMarkdown(HtmlNode element, StringBuilder mar
             case "blockquote":
                 markdown.AppendLine();
                 markdown.Append("> ");
-                ConvertChildrenToMarkdown(element, markdown, indentLevel);
+                ConvertBlockquoteChildren(element, markdown, indentLevel);
                 markdown.AppendLine();
                 markdown.AppendLine();
                 break;
@@ -318,6 +318,28 @@ private static void ConvertChildrenToMarkdown(HtmlNode element, StringBuilder ma
         }
     }
 
+    /// <summary>
+    /// Renders the children of a blockquote element. The children of each paragraph element
+    /// are rendered directly, with a line break emitted before every paragraph except the
+    /// first; all other nodes go through the regular node conversion.
+    /// </summary>
+    /// <param name="element">Blockquote element; nothing is written when it or its child list is null.</param>
+    /// <param name="markdown">Builder that receives the Markdown.</param>
+    /// <param name="indentLevel">Indent level passed through to the nested conversions.</param>
+    private static void ConvertBlockquoteChildren(HtmlNode element, StringBuilder markdown, int indentLevel)
+    {
+        var children = element?.ChildNodes;
+        if (children == null)
+        {
+            return;
+        }
+
+        var paragraphSeen = false;
+        foreach (var node in children)
+        {
+            var isParagraph = node.NodeType == HtmlNodeType.Element && node.Name.ToLowerInvariant() == "p";
+            if (!isParagraph)
+            {
+                ConvertNodeToMarkdown(node, markdown, indentLevel);
+                continue;
+            }
+
+            // For paragraph children in blockquotes, don't add the leading newline
+            if (paragraphSeen)
+            {
+                markdown.AppendLine();
+            }
+
+            ConvertChildrenToMarkdown(node, markdown, indentLevel);
+            paragraphSeen = true;
+        }
+    }
+
     private static void ConvertTableToMarkdown(HtmlNode table, StringBuilder markdown)
     {
         var rows = table.SelectNodes(".//tr");