// Source: markitdownnet/src/MarkItDownNet/MarkItDownConverter.cs at e4ecc9e065ef1e7e170e7c3472758cd16fef25e2 (mapo80/markitdownnet, GitHub)

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Markdig;
using Serilog;
using Tesseract;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using PDFtoImage;
using SkiaSharp;

namespace MarkItDownNet;

/// <summary>Main entry point for converting documents to markdown and bounding boxes.</summary>
/// <remarks>
/// PDFs are read through their native text layer first; when that yields fewer words than
/// <c>MinimumNativeWordThreshold</c> the pages are rasterized and run through Tesseract OCR instead.
/// Images always go through OCR.
/// </remarks>
public class MarkItDownConverter
{
    private readonly MarkItDownOptions _options;
    private readonly ILogger _logger;

    /// <summary>Creates a converter.</summary>
    /// <param name="options">Conversion options; a default <see cref="MarkItDownOptions"/> instance is used when null.</param>
    /// <param name="logger">Logger to write to; the global Serilog <see cref="Log.Logger"/> is used when null.</param>
    public MarkItDownConverter(MarkItDownOptions? options = null, ILogger? logger = null)
    {
        _options = options ?? new MarkItDownOptions();
        _logger = logger ?? Log.Logger;
    }

    /// <summary>Convert the input file based on the provided mime type.</summary>
    /// <param name="path">Path of the file to convert.</param>
    /// <param name="mimeType">
    /// <c>application/pdf</c> selects the PDF pipeline, anything starting with <c>image/</c> selects OCR.
    /// </param>
    /// <param name="cancellationToken">Token observed before the work starts and between pages/words.</param>
    /// <returns>The markdown text together with the page, line and word geometry.</returns>
    /// <exception cref="ArgumentException"><paramref name="path"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="mimeType"/> is null.</exception>
    /// <exception cref="NotSupportedException"><paramref name="mimeType"/> is neither a PDF nor an image type.</exception>
    /// <exception cref="OperationCanceledException">The token was cancelled.</exception>
    public async Task<MarkItDownResult> ConvertAsync(string path, string mimeType, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(path))
        {
            throw new ArgumentException("Path is required", nameof(path));
        }

        // Without this guard a null mime type would surface as a NullReferenceException from StartsWith below.
        if (mimeType is null)
        {
            throw new ArgumentNullException(nameof(mimeType));
        }

        cancellationToken.ThrowIfCancellationRequested();

        // The conversion is synchronous CPU/IO work, so it is pushed to the thread pool.
        // ConfigureAwait(false): library code has no need to resume on the caller's context.
        return mimeType switch
        {
            "application/pdf" => await Task.Run(() => ProcessPdf(path, cancellationToken), cancellationToken).ConfigureAwait(false),
            // Ordinal: mime types are protocol identifiers, not linguistic text.
            _ when mimeType.StartsWith("image/", StringComparison.Ordinal) => await Task.Run(() => ProcessImage(path, cancellationToken), cancellationToken).ConfigureAwait(false),
            _ => throw new NotSupportedException($"Unsupported mime type '{mimeType}'.")
        };
    }

    /// <summary>
    /// Extracts words from the PDF's native text layer and groups them into lines; falls back to
    /// <see cref="ProcessPdfWithOcr"/> when fewer than <c>MinimumNativeWordThreshold</c> words are found.
    /// </summary>
    private MarkItDownResult ProcessPdf(string path, CancellationToken ct)
    {
        var pages = new List<Page>();
        var lines = new List<Line>();
        var words = new List<Word>();

        // Scoped so the native document and its file handle are released before a possible OCR
        // fallback, which opens the same file again.
        using (var stream = File.OpenRead(path))
        using (var document = PdfDocument.Open(stream))
        {
            foreach (var page in document.GetPages())
            {
                ct.ThrowIfCancellationRequested();
                pages.Add(new Page(page.Number, page.Width, page.Height));

                var pageWords = page.GetWords()
                    .Select(w => new Word(page.Number, w.Text, BoundingBox.FromPdf(w.BoundingBox, page.Width, page.Height)))
                    .ToList();

                words.AddRange(pageWords);

                foreach (var lineWords in GroupWordsIntoLines(pageWords))
                {
                    var text = string.Join(" ", lineWords.Select(w => w.Text));
                    var union = Union(lineWords.Select(w => w.BBox));
                    lines.Add(new Line(page.Number, text, union));
                }
            }
        }

        // If there are not enough words, fall back to OCR
        if (words.Count < _options.MinimumNativeWordThreshold)
        {
            _logger.Information("Native text too small ({Count}), attempting OCR fallback", words.Count);
            return ProcessPdfWithOcr(path, ct);
        }

        var markdown = BuildMarkdown(lines);
        return new MarkItDownResult(markdown, pages, lines, words);
    }

    /// <summary>
    /// Rasterizes every PDF page at <c>PdfRasterDpi</c> and runs OCR on each image.
    /// Page numbers are assigned sequentially starting at 1; page sizes are the bitmap sizes in pixels.
    /// </summary>
    private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct)
    {
        var pages = new List<Page>();
        var lines = new List<Line>();
        var words = new List<Word>();

        // Rasterize PDF into images using PDFtoImage
        var renderOptions = new RenderOptions { Dpi = _options.PdfRasterDpi };

        // One engine for the whole document instead of one per page; created lazily so a
        // document that yields no pages never initializes Tesseract.
        TesseractEngine? engine = null;
        try
        {
            using var stream = File.OpenRead(path);
            foreach (var bitmap in Conversion.ToImages(stream, leaveOpen: false, password: null, renderOptions))
            {
                using (bitmap)
                {
                    // Checked inside the using so the current bitmap is disposed when cancellation throws.
                    ct.ThrowIfCancellationRequested();

                    var pageNumber = pages.Count + 1;
                    pages.Add(new Page(pageNumber, bitmap.Width, bitmap.Height));

                    // The bitmap is handed to Tesseract as an in-memory PNG.
                    using var image = SKImage.FromBitmap(bitmap);
                    using var data = image.Encode(SKEncodedImageFormat.Png, 100)
                        ?? throw new InvalidOperationException($"Failed to encode page {pageNumber} as PNG for OCR.");
                    using var pix = Pix.LoadFromMemory(data.ToArray());

                    engine ??= CreateOcrEngine();
                    var (pageLines, pageWords) = ProcessPix(engine, pix, pageNumber, ct);
                    lines.AddRange(pageLines);
                    words.AddRange(pageWords);
                }
            }
        }
        finally
        {
            engine?.Dispose();
        }

        var markdown = BuildMarkdown(lines);
        return new MarkItDownResult(markdown, pages, lines, words);
    }

    /// <summary>Runs OCR on a single image file and reports it as page 1, sized in pixels.</summary>
    private MarkItDownResult ProcessImage(string path, CancellationToken ct)
    {
        using var pix = Pix.LoadFromFile(path);
        using var engine = CreateOcrEngine();
        var (lines, words) = ProcessPix(engine, pix, 1, ct);
        var pages = new List<Page> { new Page(1, pix.Width, pix.Height) };
        var markdown = BuildMarkdown(lines);
        return new MarkItDownResult(markdown, pages, lines, words);
    }

    /// <summary>Creates a Tesseract engine from <c>OcrDataPath</c> (empty string when null) and <c>OcrLanguages</c>.</summary>
    private TesseractEngine CreateOcrEngine()
    {
        return new TesseractEngine(_options.OcrDataPath ?? string.Empty, _options.OcrLanguages, EngineMode.Default);
    }

    /// <summary>
    /// Runs OCR on <paramref name="pix"/> and collects the non-empty text lines and words with
    /// bounding boxes expressed as fractions of the image size.
    /// </summary>
    /// <param name="engine">Engine to recognize with; it is not disposed here.</param>
    /// <param name="pix">Image to recognize.</param>
    /// <param name="pageNumber">Page number stamped on every produced line and word.</param>
    /// <param name="ct">Checked once per word.</param>
    private (List<Line> lines, List<Word> words) ProcessPix(TesseractEngine engine, Pix pix, int pageNumber, CancellationToken ct)
    {
        var lines = new List<Line>();
        var words = new List<Word>();

        // The OCR page is disposed when this method returns, which frees the engine for the next image.
        using var page = engine.Process(pix);
        using var iter = page.GetIterator();
        iter.Begin();
        do
        {
            ct.ThrowIfCancellationRequested();

            // The iterator advances word by word; a line is recorded when its first word is reached.
            if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine) &&
                iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out var rectLine))
            {
                var text = iter.GetText(PageIteratorLevel.TextLine)?.Trim() ?? string.Empty;
                if (!string.IsNullOrEmpty(text))
                {
                    lines.Add(new Line(pageNumber, text, Normalize(rectLine, pix.Width, pix.Height)));
                }
            }

            if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var rectWord))
            {
                var wText = iter.GetText(PageIteratorLevel.Word)?.Trim() ?? string.Empty;
                if (!string.IsNullOrEmpty(wText))
                {
                    words.Add(new Word(pageNumber, wText, Normalize(rectWord, pix.Width, pix.Height)));
                }
            }
        } while (iter.Next(PageIteratorLevel.Word));

        return (lines, words);
    }

    /// <summary>Scales a pixel rectangle by the image size so position and size become fractions of width/height.</summary>
    private static BoundingBox Normalize(Rect rect, int width, int height)
    {
        return new BoundingBox((double)rect.X1 / width, (double)rect.Y1 / height, (double)rect.Width / width, (double)rect.Height / height);
    }

    /// <summary>
    /// Splits the words of one page into lines: words are sorted by <c>BBox.Y</c> then <c>BBox.X</c>, and a new
    /// line starts whenever a word's Y differs from the previous word's Y by more than the tolerance.
    /// </summary>
    /// <remarks>
    /// NOTE(review): because the sort key is Y before X, words inside one group stay in Y order, so words
    /// of the same visual line with slightly different Y values may not come out left-to-right. The
    /// reference Y also follows the most recent word, so a chain of small steps can join rows that are
    /// further apart than the tolerance. Confirm against what <c>BoundingBox.FromPdf</c> produces before
    /// changing either behavior.
    /// </remarks>
    private static IEnumerable<IEnumerable<Word>> GroupWordsIntoLines(IReadOnlyList<Word> words)
    {
        const double tolerance = 0.02; // normalized units
        var result = new List<List<Word>>();
        var sorted = words.OrderBy(w => w.BBox.Y).ThenBy(w => w.BBox.X).ToList();

        var current = new List<Word>();
        double? currentTop = null;
        foreach (var w in sorted)
        {
            if (currentTop == null || Math.Abs(w.BBox.Y - currentTop.Value) <= tolerance)
            {
                currentTop = w.BBox.Y;
                current.Add(w);
            }
            else
            {
                result.Add(current);
                current = new List<Word> { w };
                currentTop = w.BBox.Y;
            }
        }
        if (current.Count > 0)
        {
            result.Add(current);
        }

        return result;
    }

    /// <summary>Returns the smallest box that contains every box in <paramref name="rects"/>.</summary>
    /// <exception cref="InvalidOperationException"><paramref name="rects"/> is empty.</exception>
    private static BoundingBox Union(IEnumerable<BoundingBox> rects)
    {
        // Materialize once: callers pass deferred LINQ queries, which would otherwise be re-run four times.
        var boxes = rects as IReadOnlyCollection<BoundingBox> ?? rects.ToList();

        var left = boxes.Min(r => r.X);
        var top = boxes.Min(r => r.Y);
        var right = boxes.Max(r => r.X + r.Width);
        var bottom = boxes.Max(r => r.Y + r.Height);
        return new BoundingBox(left, top, right - left, bottom - top);
    }

    /// <summary>
    /// Joins the line texts with <c>\n</c> and, when <c>NormalizeMarkdown</c> is set, passes the result
    /// through Markdig's <see cref="Markdown.Normalize(string, Markdig.Renderers.Normalize.NormalizeOptions, MarkdownPipeline, MarkdownParserContext)"/>.
    /// </summary>
    private string BuildMarkdown(IEnumerable<Line> lines)
    {
        var raw = string.Join("\n", lines.Select(l => l.Text));
        if (_options.NormalizeMarkdown)
        {
            return Markdown.Normalize(raw);
        }
        return raw;
    }
}