1+ using Syncfusion . Drawing ;
2+ using Syncfusion . OCRProcessor ;
3+ using Syncfusion . Pdf . Graphics ;
4+ using Syncfusion . Pdf . Parsing ;
5+ using System . Diagnostics ;
6+ using System . Xml . Linq ;
7+ using System ;
8+ using System . IO ;
9+ using System . Linq ;
10+
11+ // Main application logic
/// <summary>
/// Console entry point: runs OCR over <c>Data/Input.pdf</c> with
/// <see cref="Tesseract5OcrEngine"/> plugged into the Syncfusion <c>OCRProcessor</c>,
/// then writes the processed PDF and the extracted text under <c>Output/</c>.
/// </summary>
class Program
{
    /// <summary>
    /// Loads the input PDF, performs OCR, and saves both the resulting PDF and the
    /// plain text returned by the OCR call.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Define input and output paths (resolved against the current working directory).
        string inputPdfPath = Path.GetFullPath(@"Data/Input.pdf");
        string outputPdfPath = Path.GetFullPath(@"Output/Output.pdf");
        string outputTextPath = Path.GetFullPath(@"Output/Output.txt");

        // FileMode.Create and File.WriteAllText do not create missing folders, so make
        // sure the output directories exist before any OCR work is done.
        EnsureParentDirectory(outputPdfPath);
        EnsureParentDirectory(outputTextPath);

        // Stacked 'using' statements dispose in reverse order: document, stream, processor.
        using (OCRProcessor processor = new OCRProcessor())
        using (FileStream stream = new FileStream(inputPdfPath, FileMode.Open, FileAccess.Read))
        using (PdfLoadedDocument lDoc = new PdfLoadedDocument(stream))
        {
            processor.Settings.Language = Languages.English;
            IOcrEngine tesseractEngine = new Tesseract5OcrEngine();
            processor.ExternalEngine = tesseractEngine;

            Console.WriteLine("Performing OCR using Tesseract engine...");

            // Perform OCR on the loaded PDF document; the call returns the extracted text.
            string extractedText = processor.PerformOCR(lDoc);

            // Save the document after OCR processing.
            using (FileStream fileStream = new FileStream(outputPdfPath, FileMode.Create))
            {
                lDoc.Save(fileStream);
            }
            Console.WriteLine($"OCR processed PDF saved to '{outputPdfPath}'.");

            // Save the extracted text to a .txt file.
            File.WriteAllText(outputTextPath, extractedText);
            Console.WriteLine($"Extracted text saved to '{outputTextPath}'.");
        }

        Console.WriteLine("Application finished. Press any key to exit.");

        // Console.ReadKey throws InvalidOperationException when stdin is redirected
        // (e.g. piped input or CI), so only wait for a key on an interactive console.
        if (!Console.IsInputRedirected)
        {
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Creates the directory that will contain <paramref name="filePath"/> if it does not exist yet.
    /// </summary>
    /// <param name="filePath">Full path of a file that is about to be written.</param>
    private static void EnsureParentDirectory(string filePath)
    {
        string directory = Path.GetDirectoryName(filePath);

        // GetDirectoryName returns null/empty for root paths; nothing to create then.
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }
    }
}
56+
57+ // Tesseract5OcrEngine implementation
/// <summary>
/// <see cref="IOcrEngine"/> implementation that runs the <c>tesseract</c> command-line
/// tool on an image stream, reads the hOCR file it produces, and converts the word
/// bounding boxes into an <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OcrEngine : IOcrEngine
{
    // hOCR class names used for line-level spans. Besides "ocr_line", Tesseract's hOCR
    // output labels some lines "ocr_header", "ocr_caption" or "ocr_textfloat"; all of
    // them contain "ocrx_word" children that must be collected.
    private static readonly string[] LineSpanClasses =
    {
        "ocr_line",
        "ocr_header",
        "ocr_caption",
        "ocr_textfloat"
    };

    private float imageHeight;
    private float imageWidth;

    /// <summary>
    /// Runs Tesseract on the image contained in <paramref name="stream"/> and returns the
    /// recognized page/line/word layout.
    /// </summary>
    /// <param name="stream">Readable stream holding the image to recognize.</param>
    /// <returns>The layout built from Tesseract's hOCR output, with image width and height set.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code, produced no hOCR file, or produced an empty one.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
        {
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));
        }

        // Rewind only when the stream supports it; a forward-only stream is read from
        // its current position instead of failing with NotSupportedException.
        if (stream.CanSeek)
        {
            stream.Position = 0;
        }

        // Read the input once; the same bytes feed both the dimension probe and the
        // temporary file handed to Tesseract.
        byte[] imageBytes;
        using (MemoryStream buffer = new MemoryStream())
        {
            stream.CopyTo(buffer);
            imageBytes = buffer.ToArray();
        }

        // Determine image dimensions.
        using (MemoryStream imageStream = new MemoryStream(imageBytes))
        {
            PdfTiffImage pdfTiffImage = new PdfTiffImage(imageStream); // Assumes compatible image utility
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string tempImageFile = Path.GetTempFileName();
        // Tesseract appends ".hocr" to the output base name it is given.
        string tempHocrFile = tempImageFile + ".hocr";

        try
        {
            // Write the image to the temporary file Tesseract will read.
            File.WriteAllBytes(tempImageFile, imageBytes);

            ProcessStartInfo startInfo = new ProcessStartInfo
            {
                FileName = "tesseract",
                // <input> <output base> -l eng hocr
                Arguments = $"\"{tempImageFile}\" \"{tempImageFile}\" -l eng hocr",
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true
            };

            string hocrText;
            using (Process process = new Process { StartInfo = startInfo })
            {
                process.Start();
                // Drain stderr before waiting so a full pipe cannot block the child.
                string errorOutput = process.StandardError.ReadToEnd();
                process.WaitForExit();

                if (process.ExitCode != 0)
                {
                    throw new InvalidOperationException(
                        $"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
                }

                if (!File.Exists(tempHocrFile))
                {
                    throw new InvalidOperationException(
                        "HOCR output file not found. Tesseract might have failed or not produced output.");
                }

                hocrText = File.ReadAllText(tempHocrFile);
            }

            if (string.IsNullOrEmpty(hocrText))
            {
                throw new InvalidOperationException("HOCR text could not be generated or was empty.");
            }

            OCRLayoutResult oCRLayoutResult = new OCRLayoutResult();
            BuildOCRLayoutResult(oCRLayoutResult, hocrText, imageWidth, imageHeight);
            oCRLayoutResult.ImageWidth = imageWidth;
            oCRLayoutResult.ImageHeight = imageHeight;

            return oCRLayoutResult;
        }
        finally
        {
            // Best-effort cleanup: a failed delete must not mask the exception (if any)
            // that is already propagating out of the try block.
            TryDeleteFile(tempImageFile);
            TryDeleteFile(tempHocrFile);
            Console.WriteLine("Temporary Tesseract files cleaned up.");
        }
    }

    /// <summary>
    /// Parses hOCR markup and appends one page per <c>ocr_page</c> element, with its
    /// lines and words, to <paramref name="ocr"/>.
    /// </summary>
    /// <param name="ocr">Result object that receives the parsed pages.</param>
    /// <param name="hOcrText">hOCR (XHTML) document text.</param>
    /// <param name="imageWidth">Source image width (currently not used by the parser).</param>
    /// <param name="imageHeight">Source image height (currently not used by the parser).</param>
    private void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        XDocument doc = XDocument.Parse(hOcrText, LoadOptions.None);
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (XElement pageElement in doc.Descendants(ns + "div").Where(d => HasClass(d, "ocr_page")))
        {
            Page ocrPage = new Page();

            foreach (XElement lineElement in pageElement.Descendants(ns + "span").Where(IsLineSpan))
            {
                Line ocrLine = new Line();

                foreach (XElement wordElement in lineElement.Descendants(ns + "span").Where(s => HasClass(s, "ocrx_word")))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };

                    // bbox is "x0 y0 x1 y1"; convert the two corners to x/y/width/height.
                    // Words without a usable bbox are still added, just without a rectangle.
                    if (TryParseBoundingBox(wordElement.Attribute("title")?.Value, out int[] coords))
                    {
                        float x = coords[0];
                        float y = coords[1];
                        float width = coords[2] - coords[0];
                        float height = coords[3] - coords[1];
                        ocrWord.Rectangle = new RectangleF(x, y, width, height);
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }

    // True when the element's class attribute equals the given hOCR class name.
    private static bool HasClass(XElement element, string className)
    {
        return element.Attribute("class")?.Value == className;
    }

    // True when the span is one of the line-level hOCR classes listed in LineSpanClasses.
    private static bool IsLineSpan(XElement span)
    {
        string cssClass = span.Attribute("class")?.Value;
        return cssClass != null && Array.IndexOf(LineSpanClasses, cssClass) >= 0;
    }

    // Extracts the four integers of the "bbox" property from an hOCR title attribute
    // ("bbox x0 y0 x1 y1; x_wconf 95"). Returns false instead of throwing when the
    // property is missing or malformed.
    private static bool TryParseBoundingBox(string title, out int[] coords)
    {
        coords = null;
        if (string.IsNullOrEmpty(title))
        {
            return false;
        }

        foreach (string property in title.Split(';'))
        {
            string trimmed = property.Trim();
            if (!trimmed.StartsWith("bbox", StringComparison.Ordinal))
            {
                continue;
            }

            string[] parts = trimmed.Substring(4).Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length != 4)
            {
                return false;
            }

            int[] parsed = new int[4];
            for (int i = 0; i < parsed.Length; i++)
            {
                // hOCR coordinates are machine-generated integers: parse culture-invariantly.
                if (!int.TryParse(
                        parts[i],
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out parsed[i]))
                {
                    return false;
                }
            }

            coords = parsed;
            return true;
        }

        return false;
    }

    // Deletes a temporary file, reporting (not throwing) when the delete fails.
    private static void TryDeleteFile(string path)
    {
        try
        {
            // File.Delete is a no-op when the file does not exist.
            File.Delete(path);
        }
        catch (IOException ex)
        {
            Console.Error.WriteLine($"Could not delete temporary file '{path}': {ex.Message}");
        }
        catch (UnauthorizedAccessException ex)
        {
            Console.Error.WriteLine($"Could not delete temporary file '{path}': {ex.Message}");
        }
    }
}
0 commit comments