980096: Added offline Text Recognition from Scanned PDFs Using Tesseract OCR and Syncfusion Integration

sameerkhan001 · sameerkhan001 · commit e727dd9c2c03 · 2025-09-12T12:21:50.000+05:30
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/.dockerignore b/OCR/.NET/Offline-Tesseract-OCR-Integration/.dockerignore
@@ -0,0 +1,30 @@
+**/.classpath
+**/.dockerignore
+**/.env
+**/.git
+**/.gitignore
+**/.project
+**/.settings
+**/.toolstarget
+**/.vs
+**/.vscode
+**/*.*proj.user
+**/*.dbmdl
+**/*.jfm
+**/azds.yaml
+**/bin
+**/charts
+**/docker-compose*
+**/Dockerfile*
+**/node_modules
+**/npm-debug.log
+**/obj
+**/secrets.dev.yaml
+**/values.dev.yaml
+LICENSE
+README.md
+!**/.gitignore
+!.git/HEAD
+!.git/config
+!.git/packed-refs
+!.git/refs/heads/**
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.sln b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.sln
@@ -0,0 +1,25 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.14.36408.4 d17.14
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Offline-Tesseract-OCR-Integration", "Offline-Tesseract-OCR-Integration\Offline-Tesseract-OCR-Integration.csproj", "{92C3B623-ED53-4127-8161-975BCD7AA532}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.Build.0 = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {BEF3B3F0-759C-4D53-BF94-8EB1E0E7D2FE}
+	EndGlobalSection
+EndGlobal
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Data/Input.pdf b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Data/Input.pdf
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Dockerfile b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Dockerfile
@@ -0,0 +1,29 @@
+# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.
+
+# This stage is used when running from VS in fast mode (Default for Debug configuration).
+# It must be named "base" because the final stage below builds FROM base.
+FROM mcr.microsoft.com/dotnet/runtime:8.0 AS base
+# Install the Tesseract CLI that the application invokes, then drop the apt package lists to keep the layer small.
+RUN apt-get update && apt-get install -y tesseract-ocr && rm -rf /var/lib/apt/lists/*
+USER $APP_UID
+WORKDIR /app
+
+
+# This stage is used to build the service project
+FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
+ARG BUILD_CONFIGURATION=Release
+WORKDIR /src
+COPY ["Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj", "Offline-Tesseract-OCR-Integration/"]
+RUN dotnet restore "./Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj"
+COPY . .
+WORKDIR "/src/Offline-Tesseract-OCR-Integration"
+RUN dotnet build "./Offline-Tesseract-OCR-Integration.csproj" -c $BUILD_CONFIGURATION -o /app/build
+
+# This stage is used to publish the service project to be copied to the final stage
+FROM build AS publish
+ARG BUILD_CONFIGURATION=Release
+RUN dotnet publish "./Offline-Tesseract-OCR-Integration.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
+
+# This stage is used in production or when running from VS in regular mode (Default when not using the Debug configuration)
+FROM base AS final
+WORKDIR /app
+COPY --from=publish /app/publish .
+ENTRYPOINT ["dotnet", "Offline-Tesseract-OCR-Integration.dll"]
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj
@@ -0,0 +1,24 @@
+﻿<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <RootNamespace>Offline_Tesseract_OCR_Integration</RootNamespace>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+    <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.22.1" />
+    <PackageReference Include="SkiaSharp.NativeAssets.Linux.NoDependencies" Version="3.119.0" />
+    <PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="31.1.18" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <None Update="Data\Input.pdf">
+      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+
+</Project>
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj.user b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration.csproj.user
@@ -0,0 +1,6 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <ActiveDebugProfile>Container (Dockerfile)</ActiveDebugProfile>
+  </PropertyGroup>
+</Project>
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Output/.gitkeep b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Output/.gitkeep
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Program.cs b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Program.cs
@@ -0,0 +1,185 @@
+﻿using Syncfusion.Drawing;       
+using Syncfusion.OCRProcessor;     
+using Syncfusion.Pdf.Graphics;     
+using Syncfusion.Pdf.Parsing;       
+using System.Diagnostics;           
+using System.Xml.Linq;             
+using System;                       
+using System.IO;                    
+using System.Linq;                  
+
+// Main application logic
+/// <summary>
+/// Console entry point of the sample: runs OCR over a PDF using the external Tesseract engine.
+/// </summary>
+class Program
+{
+    /// <summary>
+    /// Loads <c>Data/Input.pdf</c>, performs OCR through <see cref="Tesseract5OcrEngine"/>, and writes
+    /// the processed PDF and the extracted text to <c>Output/Output.pdf</c> and <c>Output/Output.txt</c>.
+    /// Relative paths are resolved against the current working directory.
+    /// </summary>
+    /// <param name="args">Command-line arguments (not used).</param>
+    static void Main(string[] args)
+    {
+        // Define input and output paths
+        string inputPdfPath = Path.GetFullPath(@"Data/Input.pdf");
+        string outputPdfPath = Path.GetFullPath(@"Output/Output.pdf");
+        string outputTextPath = Path.GetFullPath(@"Output/Output.txt");
+
+        // The Output folder may not exist (for example in a published or containerized build) and
+        // neither FileStream nor File.WriteAllText creates missing directories, so create it up front.
+        Directory.CreateDirectory(Path.GetDirectoryName(outputPdfPath)!);
+
+        // Stacked using statements: everything is disposed (document, stream, processor) when the block ends.
+        using (OCRProcessor processor = new OCRProcessor())
+        using (FileStream stream = new FileStream(inputPdfPath, FileMode.Open, FileAccess.Read))
+        using (PdfLoadedDocument lDoc = new PdfLoadedDocument(stream))
+        {
+            processor.Settings.Language = Languages.English;
+            processor.ExternalEngine = new Tesseract5OcrEngine();
+
+            Console.WriteLine("Performing OCR using Tesseract engine...");
+
+            // Perform OCR on the loaded PDF document; the returned string is the extracted text.
+            string extractedText = processor.PerformOCR(lDoc);
+
+            // Save the modified PDF (e.g., with hidden text layer from OCR)
+            using (FileStream fileStream = new FileStream(outputPdfPath, FileMode.Create))
+            {
+                lDoc.Save(fileStream);
+            }
+            Console.WriteLine($"OCR processed PDF saved to '{outputPdfPath}'.");
+
+            // Save the extracted text to a .txt file
+            File.WriteAllText(outputTextPath, extractedText);
+            Console.WriteLine($"Extracted text saved to '{outputTextPath}'.");
+        }
+
+        // Console.ReadKey throws InvalidOperationException when input is redirected or no console is
+        // attached (e.g. a container started without a TTY), so only pause for interactive runs.
+        if (Console.IsInputRedirected)
+        {
+            Console.WriteLine("Application finished.");
+        }
+        else
+        {
+            Console.WriteLine("Application finished. Press any key to exit.");
+            Console.ReadKey();
+        }
+    }
+}
+
+// Tesseract5OcrEngine implementation
+/// <summary>
+/// <see cref="IOcrEngine"/> implementation that runs the <c>tesseract</c> command-line tool on the
+/// supplied image, requests hOCR output, and converts that output into an <see cref="OCRLayoutResult"/>.
+/// </summary>
+class Tesseract5OcrEngine : IOcrEngine
+{
+    // hOCR classes that mark a text line; Tesseract picks one depending on the detected block type.
+    private static readonly string[] LineClasses = { "ocr_line", "ocr_header", "ocr_caption", "ocr_textfloat" };
+
+    /// <summary>
+    /// Recognizes the text of one image by invoking the external <c>tesseract</c> executable.
+    /// </summary>
+    /// <param name="stream">Readable stream containing the image to recognize.</param>
+    /// <returns>The pages, lines and words parsed from Tesseract's hOCR output, plus the image size.</returns>
+    /// <exception cref="ArgumentException"><paramref name="stream"/> is null or not readable.</exception>
+    /// <exception cref="InvalidOperationException">
+    /// Tesseract exited with a non-zero code, or produced no (or empty) hOCR output.
+    /// </exception>
+    public OCRLayoutResult PerformOCR(Stream stream)
+    {
+        if (stream == null || !stream.CanRead)
+        {
+            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));
+        }
+
+        // Buffer the image once so it can be both measured and written to disk,
+        // even when the incoming stream does not support seeking.
+        using MemoryStream imageBuffer = new MemoryStream();
+        if (stream.CanSeek)
+        {
+            stream.Position = 0;
+        }
+        stream.CopyTo(imageBuffer);
+
+        // Determine image dimensions
+        imageBuffer.Position = 0;
+        PdfTiffImage pdfTiffImage = new PdfTiffImage(imageBuffer); // Assumes compatible image utility
+        float imageHeight = pdfTiffImage.Height;
+        float imageWidth = pdfTiffImage.Width;
+
+        string tempImageFile = Path.GetTempFileName();
+        // Tesseract appends ".hocr" to the output base name passed as its second argument.
+        string tempHocrFile = tempImageFile + ".hocr";
+
+        try
+        {
+            // Write the buffered image to the temporary file Tesseract will read
+            File.WriteAllBytes(tempImageFile, imageBuffer.ToArray());
+
+            ProcessStartInfo startInfo = new ProcessStartInfo
+            {
+                FileName = "tesseract",
+                // ArgumentList escapes each argument itself, so paths with spaces or quotes are safe.
+                ArgumentList = { tempImageFile, tempImageFile, "-l", "eng", "hocr" },
+                RedirectStandardError = true,
+                UseShellExecute = false,
+                CreateNoWindow = true
+            };
+
+            using (Process process = new Process { StartInfo = startInfo })
+            {
+                process.Start();
+                // Drain stderr before waiting so the child cannot block on a full pipe.
+                string errorOutput = process.StandardError.ReadToEnd();
+                process.WaitForExit();
+
+                if (process.ExitCode != 0)
+                {
+                    throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
+                }
+            }
+
+            if (!File.Exists(tempHocrFile))
+            {
+                throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");
+            }
+
+            string hocrText = File.ReadAllText(tempHocrFile);
+            if (string.IsNullOrEmpty(hocrText))
+            {
+                throw new InvalidOperationException("HOCR text could not be generated or was empty.");
+            }
+
+            OCRLayoutResult oCRLayoutResult = new OCRLayoutResult();
+            BuildOCRLayoutResult(oCRLayoutResult, hocrText, imageWidth, imageHeight);
+            oCRLayoutResult.ImageWidth = imageWidth;
+            oCRLayoutResult.ImageHeight = imageHeight;
+
+            return oCRLayoutResult;
+        }
+        finally
+        {
+            TryDelete(tempImageFile);
+            TryDelete(tempHocrFile);
+            Console.WriteLine("Temporary Tesseract files cleaned up.");
+        }
+    }
+
+    /// <summary>
+    /// Parses hOCR markup and appends one <see cref="Page"/> per <c>ocr_page</c> element to
+    /// <paramref name="ocr"/>, each holding its text lines and their <c>ocrx_word</c> words.
+    /// </summary>
+    /// <param name="ocr">Result object that receives the parsed pages.</param>
+    /// <param name="hOcrText">hOCR document text; must be well-formed XHTML.</param>
+    /// <param name="imageWidth">Width of the source image (currently not used by the parser).</param>
+    /// <param name="imageHeight">Height of the source image (currently not used by the parser).</param>
+    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
+    {
+        XDocument doc = XDocument.Parse(hOcrText, LoadOptions.None);
+        XNamespace ns = "http://www.w3.org/1999/xhtml";
+
+        foreach (XElement pageElement in doc.Descendants(ns + "div").Where(d => HasClass(d, "ocr_page")))
+        {
+            Page ocrPage = new Page();
+
+            foreach (XElement lineElement in pageElement.Descendants(ns + "span").Where(IsTextLine))
+            {
+                Line ocrLine = new Line();
+
+                foreach (XElement wordElement in lineElement.Descendants(ns + "span").Where(s => HasClass(s, "ocrx_word")))
+                {
+                    Word ocrWord = new Word { Text = wordElement.Value };
+
+                    // A word without a usable bbox is still kept; only its rectangle stays unset.
+                    if (TryParseBoundingBox(wordElement.Attribute("title")?.Value, out RectangleF bounds))
+                    {
+                        ocrWord.Rectangle = bounds;
+                    }
+                    ocrLine.Add(ocrWord);
+                }
+                ocrPage.Add(ocrLine);
+            }
+            ocr.Add(ocrPage);
+        }
+    }
+
+    // True when the element's class attribute is exactly the given hOCR class name.
+    private static bool HasClass(XElement element, string cssClass)
+    {
+        return element.Attribute("class")?.Value == cssClass;
+    }
+
+    // True when the element is one of the hOCR text-line variants listed in LineClasses.
+    private static bool IsTextLine(XElement element)
+    {
+        string? cssClass = element.Attribute("class")?.Value;
+        return cssClass != null && Array.IndexOf(LineClasses, cssClass) >= 0;
+    }
+
+    // Finds the "bbox x0 y0 x1 y1" property in an hOCR title attribute and converts it to
+    // an (x, y, width, height) rectangle. Returns false when it is missing or malformed.
+    private static bool TryParseBoundingBox(string? title, out RectangleF bounds)
+    {
+        bounds = default;
+        if (title == null)
+        {
+            return false;
+        }
+
+        // Title properties are ';'-separated, e.g. "bbox 36 92 96 116; x_wconf 95".
+        foreach (string property in title.Split(';'))
+        {
+            string[] tokens = property.Split(' ', StringSplitOptions.RemoveEmptyEntries);
+            if (tokens.Length != 5 || tokens[0] != "bbox")
+            {
+                continue;
+            }
+
+            int[] coords = new int[4];
+            for (int i = 0; i < coords.Length; i++)
+            {
+                // Invariant culture: hOCR is machine-generated and must not depend on the user's locale.
+                if (!int.TryParse(tokens[i + 1], System.Globalization.NumberStyles.Integer,
+                                  System.Globalization.CultureInfo.InvariantCulture, out coords[i]))
+                {
+                    return false;
+                }
+            }
+
+            bounds = new RectangleF(coords[0], coords[1], coords[2] - coords[0], coords[3] - coords[1]);
+            return true;
+        }
+
+        return false;
+    }
+
+    // Best-effort delete: a cleanup failure must not mask the OCR result or the original exception.
+    private static void TryDelete(string path)
+    {
+        try
+        {
+            File.Delete(path); // No-op when the file does not exist.
+        }
+        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
+        {
+            Console.Error.WriteLine($"Could not delete temporary file '{path}': {ex.Message}");
+        }
+    }
+}
diff --git a/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Properties/launchSettings.json b/OCR/.NET/Offline-Tesseract-OCR-Integration/Offline-Tesseract-OCR-Integration/Properties/launchSettings.json
@@ -0,0 +1,10 @@
+{
+  "profiles": {
+    "Offline-Tesseract-OCR-Integration": {
+      "commandName": "Project"
+    },
+    "Container (Dockerfile)": {
+      "commandName": "Docker"
+    }
+  }
+}