Skip to content

Commit bcbb192

Browse files
committed
263124: Added OCR for multiple languages in the same scanned PDF document
1 parent 31762c6 commit bcbb192

7 files changed

Lines changed: 60 additions & 0 deletions

File tree

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.14.36616.10 d17.14
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Perform-OCR-on-PDF-with-multiple-languages", "Perform-OCR-on-PDF-with-multiple-languages\Perform-OCR-on-PDF-with-multiple-languages.csproj", "{0DB5D151-C60A-434B-B709-DC9111D1CC8F}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{0DB5D151-C60A-434B-B709-DC9111D1CC8F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{0DB5D151-C60A-434B-B709-DC9111D1CC8F}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{0DB5D151-C60A-434B-B709-DC9111D1CC8F}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{0DB5D151-C60A-434B-B709-DC9111D1CC8F}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {6F527749-5F41-444D-BBAE-1676D3E57CF2}
24+
EndGlobalSection
25+
EndGlobal

OCR/.NET/Perform-OCR-on-PDF-with-multiple-languages/Perform-OCR-on-PDF-with-multiple-languages/Output/gitkeep.txt

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
<!-- Project file for the multi-language OCR sample; builds an executable (OutputType=Exe) targeting net8.0. -->
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net8.0</TargetFramework>
6+
<RootNamespace>Perform_OCR_on_PDF_with_multiple_languages</RootNamespace>
7+
<!-- ImplicitUsings lets the program use System.IO types such as Path without an explicit using directive. -->
<ImplicitUsings>enable</ImplicitUsings>
8+
<Nullable>enable</Nullable>
9+
</PropertyGroup>
10+
11+
<ItemGroup>
12+
<!-- NOTE(review): Version="*" is a floating version, so a restore may resolve a different package release over time; consider pinning an explicit version for reproducible builds. -->
<PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="*" />
13+
</ItemGroup>
14+
15+
</Project>
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
using Syncfusion.OCRProcessor;
2+
using Syncfusion.Pdf.Graphics;
3+
using Syncfusion.Pdf.Parsing;
4+
5+
// Runs OCR over a scanned PDF whose pages contain text in more than one
// language and saves the processed document to a new file.
// Every relative path below is resolved by Path.GetFullPath against the
// current working directory, so run the sample from the project folder.

// Load the PDF document. The document and the OCR processor are both wrapped
// in using statements so their resources are released even if OCR or saving
// throws; the processor is disposed first, then the document.
using (PdfLoadedDocument loadedDocument = new PdfLoadedDocument(Path.GetFullPath(@"Input.pdf")))
// Initialize the OCR processor
using (OCRProcessor processor = new OCRProcessor())
{
    // Set a Unicode font so that Unicode characters are preserved in the PDF document
    processor.UnicodeFont = new PdfTrueTypeFont(Path.GetFullPath(@"Data/ARIALUNI.ttf"), 8);

    // Set the OCR languages; multiple language codes are joined with '+'
    processor.Settings.Language = "eng+tha";

    // Set the path to the Tesseract language data folder
    // (it must contain the trained data for every language requested above)
    processor.TessDataPath = Path.GetFullPath(@"../../Tessdata");

    // Perform OCR on the loaded document
    processor.PerformOCR(loadedDocument);

    // Save the PDF document
    loadedDocument.Save(Path.GetFullPath(@"Output/Output.pdf"));
}

OCR/.NET/Tessdata/tha.traineddata

1.02 MB
Binary file not shown.

0 commit comments

Comments
 (0)