Skip to content

Commit e727dd9

Browse files
committed
980096: Added offline Text Recognition from Scanned PDFs Using Tesseract OCR and Syncfusion Integration
1 parent e55dbfb commit e727dd9

File tree

9 files changed

+309
-0
lines changed

9 files changed

+309
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
**/.classpath
2+
**/.dockerignore
3+
**/.env
4+
**/.git
5+
**/.gitignore
6+
**/.project
7+
**/.settings
8+
**/.toolstarget
9+
**/.vs
10+
**/.vscode
11+
**/*.*proj.user
12+
**/*.dbmdl
13+
**/*.jfm
14+
**/azds.yaml
15+
**/bin
16+
**/charts
17+
**/docker-compose*
18+
**/Dockerfile*
19+
**/node_modules
20+
**/npm-debug.log
21+
**/obj
22+
**/secrets.dev.yaml
23+
**/values.dev.yaml
24+
LICENSE
25+
README.md
26+
!**/.gitignore
27+
!.git/HEAD
28+
!.git/config
29+
!.git/packed-refs
30+
!.git/refs/heads/**
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.14.36408.4 d17.14
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Offline-Tesseract-OCR-Integration", "Offline-Tesseract-OCR-Integration\Offline-Tesseract-OCR-Integration.csproj", "{92C3B623-ED53-4127-8161-975BCD7AA532}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {BEF3B3F0-759C-4D53-BF94-8EB1E0E7D2FE}
24+
EndGlobalSection
25+
EndGlobal
Binary file not shown.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.

# This stage is used when running from VS in fast mode (Default for Debug configuration).
# It must be named "base" because the final stage below starts with "FROM base".
FROM mcr.microsoft.com/dotnet/runtime:8.0 AS base
# Install the Tesseract command-line tool that the application launches as an external process.
# The apt package lists are removed in the same layer to keep the image small.
# /app/Output is created and handed to the non-root application user here (while still root),
# because the application writes its results to Output/ relative to the /app working directory.
RUN apt-get update \
    && apt-get install -y tesseract-ocr \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /app/Output \
    && chown $APP_UID /app/Output
USER $APP_UID
WORKDIR /app


# This stage is used to build the service project
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
ARG BUILD_CONFIGURATION=Release
WORKDIR /src
# Copy only the project file first so the restore layer is cached until dependencies change.
COPY ["Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj", "Offline-Tesseract-OCR-Integration/"]
RUN dotnet restore "./Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj"
COPY . .
WORKDIR "/src/Offline-Tesseract-OCR-Integration"
RUN dotnet build "./Offline-Tesseract-OCR-Integration.csproj" -c $BUILD_CONFIGURATION -o /app/build

# This stage is used to publish the service project to be copied to the final stage
FROM build AS publish
ARG BUILD_CONFIGURATION=Release
RUN dotnet publish "./Offline-Tesseract-OCR-Integration.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false

# This stage is used in production or when running from VS in regular mode (Default when not using the Debug configuration)
FROM base AS final
WORKDIR /app
COPY --from=publish /app/publish .
ENTRYPOINT ["dotnet", "Offline-Tesseract-OCR-Integration.dll"]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net8.0</TargetFramework>
6+
<RootNamespace>Offline_Tesseract_OCR_Integration</RootNamespace>
7+
<ImplicitUsings>enable</ImplicitUsings>
8+
<Nullable>enable</Nullable>
9+
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
10+
</PropertyGroup>
11+
12+
<ItemGroup>
13+
<PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.22.1" />
14+
<PackageReference Include="SkiaSharp.NativeAssets.Linux.NoDependencies" Version="3.119.0" />
15+
<PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="31.1.18" />
16+
</ItemGroup>
17+
18+
<ItemGroup>
19+
<None Update="Data\Input.pdf">
20+
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
21+
</None>
22+
</ItemGroup>
23+
24+
</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<PropertyGroup>
4+
<ActiveDebugProfile>Container (Dockerfile)</ActiveDebugProfile>
5+
</PropertyGroup>
6+
</Project>

OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Output/.gitkeep

Whitespace-only changes.
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
using Syncfusion.Drawing;
2+
using Syncfusion.OCRProcessor;
3+
using Syncfusion.Pdf.Graphics;
4+
using Syncfusion.Pdf.Parsing;
5+
using System.Diagnostics;
6+
using System.Xml.Linq;
7+
using System;
8+
using System.IO;
9+
using System.Linq;
10+
11+
// Main application logic
12+
/// <summary>
/// Console entry point. Runs OCR over <c>Data/Input.pdf</c> using the external Tesseract engine,
/// then writes the processed PDF and the extracted text into the <c>Output</c> folder.
/// </summary>
class Program
{
    /// <summary>
    /// Loads the input PDF, performs OCR through <see cref="Tesseract5OcrEngine"/>, and saves the results.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Relative paths are resolved against the current working directory.
        string inputPdfPath = Path.GetFullPath(@"Data/Input.pdf");
        string outputPdfPath = Path.GetFullPath(@"Output/Output.pdf");
        string outputTextPath = Path.GetFullPath(@"Output/Output.txt");

        // The Output folder is not guaranteed to exist next to the binaries (for example inside a
        // container), and FileMode.Create does not create missing directories.
        EnsureParentDirectory(outputPdfPath);
        EnsureParentDirectory(outputTextPath);

        // Stacked 'using' statements dispose in reverse order: document, input stream, processor.
        using (OCRProcessor processor = new OCRProcessor())
        using (FileStream stream = new FileStream(inputPdfPath, FileMode.Open, FileAccess.Read))
        using (PdfLoadedDocument lDoc = new PdfLoadedDocument(stream))
        {
            processor.Settings.Language = Languages.English;

            // Route recognition through the custom engine that shells out to the tesseract CLI.
            IOcrEngine tesseractEngine = new Tesseract5OcrEngine();
            processor.ExternalEngine = tesseractEngine;

            Console.WriteLine("Performing OCR using Tesseract engine...");

            // Perform OCR on the loaded PDF document; the return value is the extracted text.
            string extractedText = processor.PerformOCR(lDoc);

            // Save the document after OCR processing.
            using (FileStream fileStream = new FileStream(outputPdfPath, FileMode.Create))
            {
                lDoc.Save(fileStream);
            }
            Console.WriteLine($"OCR processed PDF saved to '{outputPdfPath}'.");

            // Save the extracted text to a .txt file.
            File.WriteAllText(outputTextPath, extractedText);
            Console.WriteLine($"Extracted text saved to '{outputTextPath}'.");
        }

        // Console.ReadKey throws InvalidOperationException when standard input is redirected,
        // which is the normal case for a non-interactive container run, so only wait for a key
        // when a real console is attached.
        if (Console.IsInputRedirected)
        {
            Console.WriteLine("Application finished.");
        }
        else
        {
            Console.WriteLine("Application finished. Press any key to exit.");
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Creates the directory that will contain <paramref name="filePath"/> if it does not exist yet.
    /// </summary>
    /// <param name="filePath">Full path of a file that is about to be written.</param>
    private static void EnsureParentDirectory(string filePath)
    {
        string? directory = Path.GetDirectoryName(filePath);
        if (!string.IsNullOrEmpty(directory))
        {
            // No-op when the directory already exists.
            Directory.CreateDirectory(directory);
        }
    }
}
56+
57+
// Tesseract5OcrEngine implementation
58+
/// <summary>
/// <see cref="IOcrEngine"/> implementation that runs the <c>tesseract</c> command-line tool as an
/// external process, asks it for hOCR output, and converts that output into an
/// <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OcrEngine : IOcrEngine
{
    // hOCR line-level classes whose words are collected. Besides plain lines and headers,
    // Tesseract also labels caption and floating-text lines with their own classes.
    private static readonly string[] LineClasses =
    {
        "ocr_line",
        "ocr_header",
        "ocr_caption",
        "ocr_textfloat",
    };

    // Dimensions of the most recently processed image, as reported by PdfTiffImage.
    private float imageHeight;
    private float imageWidth;

    /// <summary>
    /// Runs Tesseract over the image contained in <paramref name="stream"/> and returns the recognized layout.
    /// </summary>
    /// <param name="stream">Readable, seekable stream containing the image to recognize.</param>
    /// <returns>The pages, lines and words parsed from Tesseract's hOCR output, plus the image size.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code, or produced no (or empty) hOCR output.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
        {
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));
        }
        stream.Position = 0;

        // Determine image dimensions from an in-memory copy so the caller's stream can be re-read below.
        using (MemoryStream tempMemStream = new MemoryStream())
        {
            stream.CopyTo(tempMemStream);
            tempMemStream.Position = 0;
            PdfTiffImage pdfTiffImage = new PdfTiffImage(tempMemStream); // Assumes compatible image utility
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        // Tesseract appends ".hocr" to the output base name, which here is the image path itself.
        string tempImageFile = Path.GetTempFileName();
        string tempHocrFile = tempImageFile + ".hocr";

        try
        {
            // Write input stream to temporary image file.
            using (FileStream tempFileStream = new FileStream(tempImageFile, FileMode.Create, FileAccess.Write))
            {
                stream.Position = 0;
                stream.CopyTo(tempFileStream);
            }

            string hocrText = RunTesseract(tempImageFile, tempHocrFile);

            OCRLayoutResult oCRLayoutResult = new OCRLayoutResult();
            BuildOCRLayoutResult(oCRLayoutResult, hocrText, imageWidth, imageHeight);
            oCRLayoutResult.ImageWidth = imageWidth;
            oCRLayoutResult.ImageHeight = imageHeight;

            return oCRLayoutResult;
        }
        finally
        {
            if (File.Exists(tempImageFile)) File.Delete(tempImageFile);
            if (File.Exists(tempHocrFile)) File.Delete(tempHocrFile);
            Console.WriteLine("Temporary Tesseract files cleaned up.");
        }
    }

    /// <summary>
    /// Launches the tesseract CLI on <paramref name="imageFile"/> and returns the hOCR text it wrote.
    /// </summary>
    /// <param name="imageFile">Path of the image file; also used as Tesseract's output base name.</param>
    /// <param name="hocrFile">Path where Tesseract is expected to write the hOCR result.</param>
    /// <returns>The non-empty hOCR document text.</returns>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code, or the hOCR file is missing or empty.
    /// </exception>
    private static string RunTesseract(string imageFile, string hocrFile)
    {
        ProcessStartInfo startInfo = new ProcessStartInfo
        {
            FileName = "tesseract",
            Arguments = $"\"{imageFile}\" \"{imageFile}\" -l eng hocr",
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        using (Process process = new Process { StartInfo = startInfo })
        {
            process.Start();

            // Drain stderr before waiting so a full pipe buffer cannot block the child process.
            string errorOutput = process.StandardError.ReadToEnd();
            process.WaitForExit();

            if (process.ExitCode != 0)
            {
                throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
            }
        }

        if (!File.Exists(hocrFile))
        {
            throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");
        }

        string hocrText = File.ReadAllText(hocrFile);
        if (string.IsNullOrEmpty(hocrText))
        {
            throw new InvalidOperationException("HOCR text could not be generated or was empty.");
        }

        return hocrText;
    }

    /// <summary>
    /// Parses an hOCR document and appends its pages, lines and words to <paramref name="ocr"/>.
    /// </summary>
    /// <param name="ocr">Result object that receives one page per <c>ocr_page</c> element.</param>
    /// <param name="hOcrText">hOCR (XHTML) document text produced by Tesseract.</param>
    /// <param name="imageWidth">Width of the source image (currently not used by the parser).</param>
    /// <param name="imageHeight">Height of the source image (currently not used by the parser).</param>
    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        XDocument doc = XDocument.Parse(hOcrText, LoadOptions.None);
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (var pageElement in doc.Descendants(ns + "div").Where(d => d.Attribute("class")?.Value == "ocr_page"))
        {
            Page ocrPage = new Page();

            foreach (var lineElement in pageElement.Descendants(ns + "span").Where(IsLineElement))
            {
                Line ocrLine = new Line();

                foreach (var wordElement in lineElement.Descendants(ns + "span").Where(s => s.Attribute("class")?.Value == "ocrx_word"))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };

                    // A word without a parsable bounding box is still added, just without a rectangle.
                    if (TryParseBoundingBox(wordElement.Attribute("title")?.Value, out RectangleF bounds))
                    {
                        ocrWord.Rectangle = bounds;
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }

    /// <summary>
    /// Returns true when <paramref name="element"/> carries one of the hOCR line-level classes.
    /// </summary>
    private static bool IsLineElement(XElement element)
    {
        return element.Attribute("class")?.Value is string cssClass
            && Array.IndexOf(LineClasses, cssClass) >= 0;
    }

    /// <summary>
    /// Extracts the <c>bbox x0 y0 x1 y1</c> property from an hOCR <c>title</c> attribute.
    /// </summary>
    /// <param name="title">The raw title attribute value, e.g. <c>bbox 36 92 96 116; x_wconf 95</c>.</param>
    /// <param name="rectangle">Receives the box as (x, y, width, height) when parsing succeeds.</param>
    /// <returns>True when a well-formed bbox property was found; otherwise false.</returns>
    private static bool TryParseBoundingBox(string? title, out RectangleF rectangle)
    {
        rectangle = default;
        if (string.IsNullOrWhiteSpace(title))
        {
            return false;
        }

        // Properties are ';'-separated; search all of them instead of assuming bbox comes first.
        foreach (string property in title.Split(';'))
        {
            string[] tokens = property.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length != 5 || tokens[0] != "bbox")
            {
                continue;
            }

            int[] coords = new int[4];
            for (int i = 0; i < coords.Length; i++)
            {
                // hOCR coordinates are machine-written integers, so parse them culture-invariantly.
                if (!int.TryParse(
                        tokens[i + 1],
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out coords[i]))
                {
                    return false;
                }
            }

            // bbox stores two corners; the rectangle wants origin plus size.
            rectangle = new RectangleF(coords[0], coords[1], coords[2] - coords[0], coords[3] - coords[1]);
            return true;
        }

        return false;
    }
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"profiles": {
3+
"Offline-Tesseract-OCR-Integration": {
4+
"commandName": "Project"
5+
},
6+
"Container (Dockerfile)": {
7+
"commandName": "Docker"
8+
}
9+
}
10+
}

0 commit comments

Comments
 (0)