Skip to content

Commit 96ece75

Browse files
committed
Fixed some problems in extracting tables from a page
1 parent 62f16d5 commit 96ece75

6 files changed

Lines changed: 114 additions & 13 deletions

File tree

Demo/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ static void TestTable()
132132

133133
try
134134
{
135-
string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf");
135+
string testFilePath = Path.GetFullPath("../../../TestDocuments/err_table.pdf");
136136

137137
if (!File.Exists(testFilePath))
138138
{

Demo/TestDocuments/err_table.pdf

3.67 MB
Binary file not shown.

MuPDF.NET.Test/TableTest.cs

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,98 @@
1-
using System;
1+
using System;
22
using System.Collections.Generic;
3+
using System.IO;
34
using System.Linq;
4-
using System.Text;
5-
using System.Threading.Tasks;
5+
using MuPDF.NET;
6+
using NUnit.Framework;
67

78
namespace MuPDF.NET.Test
89
{
910
public class TableTest
1011
{
12+
/// <summary>
/// Smoke test for table extraction, based on Demo Program.TestTable().
/// Loads err_table.pdf, looks for tables on the first page with the
/// lines_strict, lines (fallback) and text strategies, and asserts that
/// Extract() and ToMarkdown() return non-null results for every table found.
/// Finally scans every page with the lines_strict strategy.
/// </summary>
/// <remarks>
/// Pages are released in finally blocks so that a failing assertion (NUnit
/// signals failure by throwing) does not leave page objects undisposed.
/// </remarks>
[Test]
public void TestTable()
{
    string testFilePath = Path.GetFullPath(Path.Combine(TestContext.CurrentContext.TestDirectory, "../../../resources/err_table.pdf"));
    Assert.That(File.Exists(testFilePath), Is.True, $"Test file not found: {testFilePath}");

    Document doc = new Document(testFilePath);
    try
    {
        Assert.That(doc.PageCount, Is.GreaterThanOrEqualTo(1));

        Page page = doc[0];
        try
        {
            // Test 1: Get tables with 'lines_strict' strategy (as in Demo)
            List<Table> tables = Utils.GetTables(
                page,
                clip: page.Rect,
                vertical_strategy: "lines_strict",
                horizontal_strategy: "lines_strict");

            Assert.That(tables, Is.Not.Null);

            if (tables.Count == 0)
            {
                // Test 2: Fallback with 'lines' strategy (as in Demo)
                tables = Utils.GetTables(
                    page,
                    clip: page.Rect,
                    vertical_strategy: "lines",
                    horizontal_strategy: "lines");

                // The fallback result is iterated below, so fail with a clear
                // message instead of a NullReferenceException.
                Assert.That(tables, Is.Not.Null);
            }

            // Test 3: Get tables with 'text' strategy (as in Demo)
            List<Table> textTables = Utils.GetTables(
                page,
                clip: page.Rect,
                vertical_strategy: "text",
                horizontal_strategy: "text");

            Assert.That(textTables, Is.Not.Null);

            // For each table found with lines_strict/lines: validate structure and Extract/ToMarkdown
            foreach (Table table in tables)
            {
                Assert.That(table.row_count, Is.GreaterThanOrEqualTo(0));
                Assert.That(table.col_count, Is.GreaterThanOrEqualTo(0));

                List<List<string>> tableData = table.Extract();
                Assert.That(tableData, Is.Not.Null);

                string markdown = table.ToMarkdown(clean: false, fillEmpty: true);
                Assert.That(markdown, Is.Not.Null);
            }
        }
        finally
        {
            // Release the first page even when an assertion above fails.
            page.Dispose();
        }

        // Test 4: Get tables from all pages (as in Demo)
        int totalTables = 0;
        for (int pageNum = 0; pageNum < doc.PageCount; pageNum++)
        {
            Page currentPage = doc[pageNum];
            try
            {
                List<Table> pageTables = Utils.GetTables(
                    currentPage,
                    clip: currentPage.Rect,
                    vertical_strategy: "lines_strict",
                    horizontal_strategy: "lines_strict");

                Assert.That(pageTables, Is.Not.Null, $"GetTables returned null for page {pageNum}");
                totalTables += pageTables.Count;
            }
            finally
            {
                currentPage.Dispose();
            }
        }

        Assert.That(totalTables, Is.GreaterThanOrEqualTo(0));
    }
    finally
    {
        doc.Close();
    }
}
95+
1196
/*
1297
[Test]
1398
public void BorderedTable()
3.67 MB
Binary file not shown.

MuPDF.NET/MuPDF.NET.nuspec

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
33
<metadata>
44
<id>MuPDF.NET</id>
5-
<version>3.2.13-rc.11</version>
5+
<version>3.2.13-rc.12</version>
66
<authors>Artifex Software Inc.</authors>
77
<requireLicenseAcceptance>true</requireLicenseAcceptance>
88
<license type="file">LICENSE.md</license>

MuPDF.NET/Table.cs

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2556,7 +2556,7 @@ bool RowHasBold(Rect rowBbox)
25562556

25572557
// Get text blocks above table
25582558
dynamic pageInfo = page.GetText("dict", clip: clip, flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT);
2559-
List<Block> blocks = pageInfo?.BLOCKS ?? new List<Block>();
2559+
List<Block> blocks = pageInfo?.Blocks ?? new List<Block>();
25602560

25612561
// Non-empty, non-superscript spans above table, sorted descending by y1
25622562
var spans = new List<Dictionary<string, object>>();
@@ -2601,7 +2601,7 @@ bool RowHasBold(Rect rowBbox)
26012601

26022602
float y1 = Convert.ToSingle(sbbox[3]);
26032603
float h = y1 - Convert.ToSingle(sbbox[1]);
2604-
bool bold = ((int)s["flags"] & (int)FontStyle.TEXT_FONT_BOLD) != 0;
2604+
bool bold = (Convert.ToInt32(s["flags"]) & (int)FontStyle.TEXT_FONT_BOLD) != 0;
26052605

26062606
if (i == 0)
26072607
{
@@ -3145,19 +3145,27 @@ private List<Edge> GetEdges()
31453145
}
31463146
}
31473147

3148-
List<Edge> vBase = new List<Edge>();
3148+
List<Edge> vBase;
31493149
if (vStrat == "lines")
31503150
{
3151-
vBase = TableGlobals.EDGES.Where(e => e.orientation == "v").ToList();
3151+
vBase = EdgeProcessing.FilterEdges(TableGlobals.EDGES, "v");
31523152
}
31533153
else if (vStrat == "lines_strict")
31543154
{
3155-
vBase = TableGlobals.EDGES.Where(e => e.orientation == "v" && e.object_type == "line").ToList();
3155+
vBase = EdgeProcessing.FilterEdges(TableGlobals.EDGES, "v", "line");
31563156
}
31573157
else if (vStrat == "text")
31583158
{
31593159
vBase = EdgeProcessing.WordsToEdgesV(words, (int)settings.min_words_vertical);
31603160
}
3161+
else if (vStrat == "explicit")
3162+
{
3163+
vBase = new List<Edge>();
3164+
}
3165+
else
3166+
{
3167+
vBase = new List<Edge>();
3168+
}
31613169

31623170
var v = vBase.Concat(vExplicit).ToList();
31633171

@@ -3197,19 +3205,27 @@ private List<Edge> GetEdges()
31973205
}
31983206
}
31993207

3200-
List<Edge> hBase = new List<Edge>();
3208+
List<Edge> hBase;
32013209
if (hStrat == "lines")
32023210
{
3203-
hBase = TableGlobals.EDGES.Where(e => e.orientation == "h").ToList();
3211+
hBase = EdgeProcessing.FilterEdges(TableGlobals.EDGES, "h");
32043212
}
32053213
else if (hStrat == "lines_strict")
32063214
{
3207-
hBase = TableGlobals.EDGES.Where(e => e.orientation == "h" && e.object_type == "line").ToList();
3215+
hBase = EdgeProcessing.FilterEdges(TableGlobals.EDGES, "h", "line");
32083216
}
32093217
else if (hStrat == "text")
32103218
{
32113219
hBase = EdgeProcessing.WordsToEdgesH(words, (int)settings.min_words_horizontal);
32123220
}
3221+
else if (hStrat == "explicit")
3222+
{
3223+
hBase = new List<Edge>();
3224+
}
3225+
else
3226+
{
3227+
hBase = new List<Edge>();
3228+
}
32133229

32143230
var h = hBase.Concat(hExplicit).ToList();
32153231

0 commit comments

Comments
 (0)