-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathProgram.Llm.TableExtract.cs
More file actions
137 lines (115 loc) · 5.28 KB
/
Program.Llm.TableExtract.cs
File metadata and controls
137 lines (115 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
using Newtonsoft.Json.Linq;
namespace Demo
{
internal partial class Program
{
/// <summary>
/// Converts the national-capitals sample PDF to JSON and prints every box whose
/// "boxclass" is "table": first its row/column dimensions, then each row with
/// cells separated by " | ".
/// </summary>
internal static void TestTableExtract1()
{
    string json = PdfExtractor.ToJson(@"..\..\..\TestDocuments\national-capitals.pdf");
    JArray pages = GetPagesFromJson(json);

    foreach (JToken pageToken in pages)
    {
        // Same explicit cast the typed foreach variable performed: a non-object page entry throws.
        var page = (JObject)pageToken;
        int pageNumber = page["page_number"]!.Value<int>();
        Console.WriteLine($"\nPage {pageNumber}");

        // A missing or non-array "boxes" entry is treated as "no boxes on this page".
        IEnumerable<JObject> allBoxes = page["boxes"] is JArray boxArray
            ? boxArray.OfType<JObject>()
            : Enumerable.Empty<JObject>();

        IEnumerable<JObject> tableBoxes = allBoxes.Where(
            candidate => string.Equals(candidate["boxclass"]?.Value<string>(), "table", StringComparison.Ordinal));

        foreach (JObject tableBox in tableBoxes)
        {
            List<List<string>> tableRows = ParseTableRows(tableBox["table"]);

            // Width of the table is the widest row; an empty table reports 0 columns.
            int widestRow = tableRows
                .Select(r => r?.Count ?? 0)
                .DefaultIfEmpty(0)
                .Max();

            Console.WriteLine($"Table: {tableRows.Count} rows x {widestRow} columns");

            foreach (List<string> cells in tableRows)
            {
                Console.WriteLine(string.Join(" | ", cells ?? []));
            }
        }
    }
}
/// <summary>
/// Converts the national-capitals sample PDF to JSON, collects every box whose
/// "boxclass" is "table", and writes all table rows to <c>tables.csv</c> (UTF-8).
/// An empty line is appended after each table.
/// </summary>
internal static void TestTableExtract2()
{
    JArray pages = GetPagesFromJson(PdfExtractor.ToJson(@"..\..\..\TestDocuments\national-capitals.pdf"));
    var csvLines = new List<string>();

    foreach (JObject page in pages)
    {
        foreach (JObject box in (page["boxes"] as JArray)?.OfType<JObject>() ?? Enumerable.Empty<JObject>())
        {
            if (!string.Equals(box["boxclass"]?.Value<string>(), "table", StringComparison.Ordinal))
            {
                continue;
            }

            var rows = ParseTableRows(box["table"]);
            foreach (var row in rows)
            {
                // Rows and cells deserialized from JSON may be null; treat both as empty.
                IEnumerable<string?> cells = row ?? [];
                csvLines.Add(string.Join(",", cells.Select(cell => EscapeCsvField(cell))));
            }

            // Blank separator line between consecutive tables.
            csvLines.Add(string.Empty);
        }
    }

    File.WriteAllLines("tables.csv", csvLines, Encoding.UTF8);
    Console.WriteLine("Write to tables.csv");

    // Quotes a field when it contains a delimiter, a double quote, or a line break,
    // doubling any embedded double quotes. A null cell becomes an empty field.
    static string EscapeCsvField(string? cell)
    {
        if (string.IsNullOrEmpty(cell))
        {
            return string.Empty;
        }

        bool needsQuoting = cell.Contains(',')
            || cell.Contains('"')
            || cell.Contains('\r')
            || cell.Contains('\n');

        return needsQuoting
            ? $"\"{cell.Replace("\"", "\"\"")}\""
            : cell;
    }
}
/// <summary>
/// Converts the national-capitals sample PDF to JSON and concatenates the rows of all
/// "table" boxes into one list, then prints the merged row count and every row.
/// When a table has the same (non-zero) column count as the table processed just
/// before it, its first row is skipped before appending.
/// </summary>
internal static void TestTableExtract3()
{
    string json = PdfExtractor.ToJson(@"..\..\..\TestDocuments\national-capitals.pdf");
    JArray pages = GetPagesFromJson(json);

    var merged = new List<List<string>>();
    int? previousWidth = null;

    foreach (JToken pageToken in pages)
    {
        // Same explicit cast the typed foreach variable performed: a non-object page entry throws.
        var page = (JObject)pageToken;

        IEnumerable<JObject> allBoxes = page["boxes"] is JArray boxArray
            ? boxArray.OfType<JObject>()
            : Enumerable.Empty<JObject>();

        IEnumerable<JObject> tableBoxes = allBoxes.Where(
            candidate => string.Equals(candidate["boxclass"]?.Value<string>(), "table", StringComparison.Ordinal));

        foreach (JObject tableBox in tableBoxes)
        {
            List<List<string>> tableRows = ParseTableRows(tableBox["table"]);

            // Stays null for an empty table or a table whose rows are all empty,
            // which resets the "previous table" state for the next iteration.
            int? width = null;

            if (tableRows.Count > 0)
            {
                int widestRow = tableRows.Max(r => r?.Count ?? 0);
                if (widestRow > 0)
                {
                    width = widestRow;
                }

                // Matching width with the preceding table: drop this table's first row.
                bool continuesPrevious = width.HasValue && width == previousWidth;
                merged.AddRange(continuesPrevious ? tableRows.Skip(1) : tableRows);
            }

            previousWidth = width;
        }
    }

    Console.WriteLine($"Merged table: {merged.Count} rows");
    foreach (List<string> cells in merged)
    {
        Console.WriteLine(string.Join(" | ", cells ?? []));
    }
}
/// <summary>
/// Runs the OCR sample PDF through Markdown conversion (result not used here) and
/// then through text conversion, printing the extracted text to the console.
/// Both calls are made with <c>useOcr: true</c>.
/// </summary>
internal static void TestOcr()
{
    const string ocrSamplePath = @"..\..\..\TestDocuments\Ocr.pdf";

    // Markdown pass: return value is intentionally not captured, matching the call as written.
    PdfExtractor.ToMarkdown(ocrSamplePath, useOcr: true, writeImages: false, embedImages: false);

    string extractedText = PdfExtractor.ToText(ocrSamplePath, useOcr: true);
    Console.WriteLine(extractedText);
}
/// <summary>
/// Loads the magazine sample PDF through the LlamaMarkdownReader, prints the page
/// number of every returned chunk, and writes each chunk's text to
/// <c>Output/page-{n}.md</c> (UTF-8).
/// </summary>
internal static void TestLLM2()
{
    // Single source of truth for the output folder. Previously the folder was created
    // as "Output" but files were written under "output/", which are different
    // directories on case-sensitive file systems.
    const string outputDirectory = "Output";

    var reader = PdfExtractor.LlamaMarkdownReader();
    var chunks = reader.LoadData(@"..\..\..\TestDocuments\magazine.pdf");

    Directory.CreateDirectory(outputDirectory);

    foreach (var chunk in chunks)
    {
        int pageNum = (int)chunk.ExtraInfo["page"];
        Console.WriteLine(pageNum);

        string filePath = Path.Combine(outputDirectory, $"page-{pageNum}.md");
        File.WriteAllText(filePath, chunk.Text, Encoding.UTF8);
    }
}
/// <summary>
/// Parses <paramref name="json"/> and returns the array of pages it contains.
/// </summary>
/// <param name="json">JSON text whose root is either an array, or an object with a "pages" array.</param>
/// <returns>The root array, or the array stored under the root object's "pages" key.</returns>
/// <exception cref="InvalidOperationException">
/// The root is neither an array nor an object with a "pages" array.
/// </exception>
private static JArray GetPagesFromJson(string json)
{
    JToken parsed = JToken.Parse(json);

    // Root is already the page list.
    if (parsed is JArray topLevelPages)
    {
        return topLevelPages;
    }

    // Root is a wrapper object carrying the page list under "pages".
    if (parsed is JObject wrapper && wrapper["pages"] is JArray nestedPages)
    {
        return nestedPages;
    }

    throw new InvalidOperationException("Expected a JSON array or an object containing a 'pages' array.");
}
/// <summary>
/// Converts a table token into a list of rows, each row being a list of cell strings.
/// </summary>
/// <param name="tableToken">
/// Either an array of rows, or an object whose "extract" property is an array of rows.
/// May be null (e.g. when the box has no "table" entry).
/// </param>
/// <returns>
/// The parsed rows. Returns an empty list when <paramref name="tableToken"/> is null or
/// has neither of the supported shapes. Null rows are returned as empty rows and null
/// cells as empty strings, so the result never contains null entries.
/// </returns>
private static List<List<string>> ParseTableRows(JToken? tableToken)
{
    JArray? rowArray = tableToken switch
    {
        JArray arr => arr,
        JObject obj when obj["extract"] is JArray extract => extract,
        _ => null
    };

    if (rowArray is null)
    {
        return [];
    }

    // JSON nulls deserialize to null rows/cells; read them as nullable and normalize
    // so the declared non-nullable return type is actually honored.
    List<List<string?>?> parsed = rowArray.ToObject<List<List<string?>?>>() ?? [];

    return parsed
        .Select(row => row?.Select(cell => cell ?? string.Empty).ToList() ?? [])
        .ToList();
}
}
}