Skip to content

Commit af7bed7

Browse files
authored
bug fix - more double quotes for better filename handling
1 parent 7caf9cf commit af7bed7

1 file changed

Lines changed: 176 additions & 0 deletions

File tree

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
ECHO OFF
2+
SETLOCAL
3+
REM ~ ###################################################################################################################
4+
REM ~ Searchable Image PDF Creat-O-Mat
5+
SET VERSION=1.2
6+
REM ~ This script creates a searchable PDF out of a PDF with one or more scanned pages. It is possible to drag and drop one or multiple PDF files onto this batch file to start the process.
7+
REM ~ But you can use the command line (<script name> [pdf filename #1] [pdf filename #2] ... [pdf filename #n]) too.
8+
REM ~
9+
REM ~ Author: TB / License: MIT / https://github.com/timberger/Searchable-Image-PDF-Creat-O-Mat/
10+
REM ~
11+
REM ~ Prerequisites:
12+
REM ~ ImageMagick (7.0.8-27 and newer) https://imagemagick.org/ | License: https://imagemagick.org/script/license.php
13+
REM ~ Ghostscript (9.x) https://www.ghostscript.com/
14+
REM ~ Tesseract (4.0 and newer) https://github.com/tesseract-ocr/tesseract/wiki | http://www.apache.org/licenses/LICENSE-2.0
15+
REM ~ OS: Microsoft Windows 7 (with PowerShell); 8; 8.1
16+
REM ~
17+
REM ~ Preferences:
18+
REM ~ (leave no whitespace between the foldername and the '=' / do not use "):
19+
SET IMAGEMAGIC=C:\Program Files\ImageMagick\magick.exe
20+
SET GHOSTSCRIPT=C:\Program Files\gs\gs9.23\bin\gswin64c.exe
21+
SET TESSERACT=C:\Program Files (x86)\Tesseract-OCR\tesseract.exe
22+
REM ~ SRCLANG shall contain the abbreviations of the installed Tesseract languages which shall be searched for in the scanned files [default: eng]. Multiple languages e.g.: deu+eng - see https://github.com/tesseract-ocr/tesseract/wiki/Data-Files
23+
SET SRCLANG=deu
24+
REM ~ The scanned page can be deskewed before it is processed with Tesseract or not [default: true / alternative: false]. It is recommended to deskew the scanned page because it increases the success rate of the OCR software, but it will take more time.
25+
SET DESKEW=true
26+
REM ~ RESULTFOLDER is the folder where the searchable PDF will be stored (%CD% is the current working directory, which is not necessarily the directory containing this script) [default: %CD%\results]
27+
SET RESULTFOLDER=%CD%\searchable_PDF
28+
REM ~ TMPFOLDER is the folder where the extracted image files will be stored temporarily (the folder will be created and removed automatically during each run) [default: %CD%\temp]
29+
SET TMPFOLDER=%CD%\temp
30+
REM ~ After Imagemagick and Tesseract have created the new PDF file it has usually a bigger file size. But it can be re-packed with Ghostscript which compresses the image file to a certain resolution e.g. screen (72dpi), ebook (150dpi), printer(300dpi), prepress(300dpi+colorpreserving)
31+
SET REPACKPROFILE=printer
32+
REM ~ ###################################################################################################################
33+
34+
REM ~ clear the screen (/ the command line window)
35+
CLS
36+
ECHO OFF
37+
38+
REM ~ starting the stop watch
39+
SET StartPosition=%time:~0,8%
40+
41+
REM ~ command line window candy: blue background color / white font color (not in Windows 10)
42+
COLOR 1F
43+
44+
ECHO ### Searchable Image PDF Creat-O-Mat %VERSION% ###
45+
46+
REM ~ Checking the preferences
47+
REM ~ Does the ImageMagick location exist?
48+
IF NOT EXIST "%IMAGEMAGIC%" (
49+
ECHO The ImageMagick location seems to be wrong. Please check the preferences.
50+
GOTO :SCRIPTEND
51+
)
52+
REM ~ Does the Ghostscript location exist?
53+
IF NOT EXIST "%GHOSTSCRIPT%" (
54+
ECHO The Ghostscript location seems to be wrong. Please check the preferences.
55+
GOTO :SCRIPTEND
56+
)
57+
REM ~ Does the Tesseract location exist?
58+
IF NOT EXIST "%TESSERACT%" (
59+
ECHO The Tesseract location seems to be wrong. Please check the preferences.
60+
GOTO :SCRIPTEND
61+
)
62+
REM ~ Does the Tesseract language package abbreviation match the expected pattern?
63+
FOR /F "usebackq tokens=*" %%i IN (`PowerShell -noninteractive -NoProfile "&{ '%SRCLANG%' | Select-String -Pattern '^([a-z]{3}_?([a-z]{3})?)(\+([a-z]{3}_?([a-z]{3})?))*$' -Quiet}"`) DO SET RST=%%i
64+
IF /I NOT "%RST%" == "true" (
65+
ECHO The language settings seem to be wrong. Please check the preferences.
66+
GOTO :SCRIPTEND
67+
)
68+
REM ~ IF there is no subfolder e.g. temp\ (for the extracted pictures) THEN create it
REM ~ The result of MKDIR is tested with the run-time form IF ERRORLEVEL 1, because a percent-expanded ERRORLEVEL inside a parenthesized block is substituted when the block is parsed, i.e. before MKDIR has run.
REM ~ The folder name is quoted in the message so that a closing parenthesis in the path cannot terminate the block early.
IF NOT EXIST "%TMPFOLDER%" (
    MKDIR "%TMPFOLDER%"
    IF ERRORLEVEL 1 (
        ECHO Unable to create "%TMPFOLDER%"
        GOTO :SCRIPTEND
    )
)
76+
REM ~ IF there is no subfolder for the searchable PDF files THEN create it
REM ~ The result of MKDIR is tested with the run-time form IF ERRORLEVEL 1, because a percent-expanded ERRORLEVEL inside a parenthesized block is substituted when the block is parsed, i.e. before MKDIR has run.
REM ~ The folder name is quoted in the message so that a closing parenthesis in the path cannot terminate the block early.
IF NOT EXIST "%RESULTFOLDER%" (
    MKDIR "%RESULTFOLDER%"
    IF ERRORLEVEL 1 (
        ECHO Unable to create "%RESULTFOLDER%"
        GOTO :SCRIPTEND
    )
)
84+
REM ~ IF the first argument given to this script is empty THEN jump to the end of the loop and the script
85+
IF "%~1" == "" (
86+
ECHO Please, drag and drop a PDF with a scanned page onto this file OR write its filename with a whitespace behind filename of the script.
87+
GOTO :LOOPEND
88+
) ELSE (
89+
REM ~ Count the arguments given to this script
90+
REM ~ source: https://en.wikibooks.org/wiki/Windows_Programming/Programming_CMD#Command-Line_Interfacing
91+
SET ARGCOUNT=0
92+
FOR %%x IN (%*) DO SET /A ARGCOUNT+=1
93+
94+
REM ~ Init the file counter
95+
SET /a AMOUNT_OF_FILES=1
96+
)
97+
:LOOP
98+
ECHO ### File %AMOUNT_OF_FILES% / %ARGCOUNT% ###
99+
ECHO %~1
100+
101+
REM ~ Resolution which Imagemagick and Tesseract shall use to handle the images (in DPI / default:300)
102+
SET RESDPI=300
103+
104+
REM ~ IF the file does not exist THEN skip it or ELSE do the whole process
105+
IF NOT EXIST "%~1" (
106+
ECHO The file "%~1" does not exist.
107+
) ELSE (
108+
REM ~ Start the ImageMagic to extract the scanned page from the PDF file
109+
ECHO Extracting the page^(s^) from the PDF file ^(density: %RESDPI% dpi^) ...
110+
"%IMAGEMAGIC%" -density %RESDPI% -units pixelsperinch -quality 85 "%~1" "%TMPFOLDER%\output_%AMOUNT_OF_FILES%-page_%%03d.png"
111+
ECHO DONE
112+
113+
REM ~ Deskew the retrieved image(s) OR not; in both cases append the path of every page image to the list file which Tesseract reads afterwards
REM ~ All paths are taken from the FOR variable with the ~f and ~dpn modifiers: they are expanded at run time, are complete for files found in subfolders and cannot break the surrounding block when the folder name contains blanks or parentheses
REM ~ The redirection is written in front of ECHO so that no trailing blank ends up in the list file
REM ~ NOTE(review): the search pattern also matches the _ds.png files written by the deskew loop - confirm that FOR /R does not pick them up while it is still iterating
IF "%DESKEW%"=="true" (
    FOR /R "%TMPFOLDER%" %%f IN (output_%AMOUNT_OF_FILES%-page_*.png) DO (
        ECHO Deskewing page %%~nf.png
        REM ~ -set option:deskew:auto-crop true -background white -sharpen 0x1.0 -sharpen 0.25x0.5
        "%IMAGEMAGIC%" "%%~ff" -deskew 80 "%%~dpnf_ds.png"
        >>"%TMPFOLDER%\pageimagefilenames.txt" ECHO %%~dpnf_ds.png
    )
    ECHO DONE
) ELSE (
    FOR /R "%TMPFOLDER%" %%f IN (output_%AMOUNT_OF_FILES%-page_*.png) DO (
        >>"%TMPFOLDER%\pageimagefilenames.txt" ECHO %%~ff
    )
)
127+
128+
REM ~ Start the OCR program (input: a picture file with scanned text / output: a searchable PDF file )
129+
"%TESSERACT%" -l %SRCLANG% --dpi %RESDPI% "%TMPFOLDER%\pageimagefilenames.txt" "%TMPFOLDER%\%~n1" pdf
130+
131+
REM ~ Repack the new PDF file with the text layer OR just move it from the TMP folder to the result folder without repacking
REM ~ The decision is stored in the flag REPACK instead of using GOTO and a label: a GOTO abandons the enclosing parenthesized block, and the former layout fell through from the MOVE into the repacking step, which then ran Ghostscript on a file that had already been moved away
REM ~ IF DEFINED is evaluated at run time, so the flag can be set and tested inside the same block
SET "REPACK="
IF "%REPACKPROFILE%"=="screen" SET "REPACK=1"
IF "%REPACKPROFILE%"=="ebook" SET "REPACK=1"
IF "%REPACKPROFILE%"=="printer" SET "REPACK=1"
IF "%REPACKPROFILE%"=="prepress" SET "REPACK=1"
IF DEFINED REPACK (
    ECHO Repacking the output PDF file ^(profile: %REPACKPROFILE%^) ...
    "%GHOSTSCRIPT%" -q -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/%REPACKPROFILE% -dNOPAUSE -dBATCH -dQUIET -sOutputFile="%RESULTFOLDER%\%~n1.pdf" "%TMPFOLDER%\%~n1.pdf"
    DEL "%TMPFOLDER%\%~n1.pdf"
    ECHO DONE
) ELSE (
    REM ~ REPACKPROFILE is not equal to screen, ebook, printer or prepress
    move "%TMPFOLDER%\%~n1.pdf" "%RESULTFOLDER%\%~n1.pdf"
)
143+
144+
REM ~ Delete the extracted picture files and the file list from the temp folder
145+
DEL "%TMPFOLDER%\output_%AMOUNT_OF_FILES%-page_*.png"
146+
DEL "%TMPFOLDER%\pageimagefilenames.txt"
147+
)
148+
SET /a "AMOUNT_OF_FILES=%AMOUNT_OF_FILES% + 1"
149+
150+
REM ~ `SHIFT` fills '%1' with the content of the second argument (`%2`), %2 with the content of third argument (`%3`) and so on
151+
SHIFT
152+
153+
REM ~ IF the AMOUNT_OF_FILES dragged onto this .bat is smaller or equal to the total amount of file/arguments AND the next argument is not empty string THEN repeat the last step again. (Otherwise continue to the end of the script.)
154+
IF %AMOUNT_OF_FILES% LEQ %ARGCOUNT% IF NOT "%~1" == "" (
155+
GOTO :LOOP
156+
)
157+
:LOOPEND
158+
159+
REM ~ remove the temp folder
160+
RMDIR "%TMPFOLDER%"
161+
162+
REM ~ setting the colors back to default
163+
COLOR
164+
165+
REM ~ determining the duration (with the help of https://stackoverflow.com/questions/42603119/arithmetic-operations-with-hhmmss-times-in-batch-file/42603985#42603985)
REM ~ The time string pads hours below 10 with a leading blank which SET /A cannot parse, so blanks are replaced by zeros in both time stamps first
SET EndPosition=%time:~0,8%
SET "EndPosition=%EndPosition: =0%"
SET "StartPosition=%StartPosition: =0%"
REM ~ Each HH:MM:SS part is prefixed with 1 and reduced by 100 so that values like 08 and 09 are not read as invalid octal numbers
SET /A "ss=(((1%EndPosition::=-100)*60+1%-100)-(((1%StartPosition::=-100)*60+1%-100)"
REM ~ A negative difference means that the run crossed midnight: add one day (86400 seconds)
IF %ss% LSS 0 SET /A "ss+=86400"
SET /A "hh=ss/3600+100,ss%%=3600,mm=ss/60+100,ss=ss%%60+100"
ECHO Duration: %hh:~1%:%mm:~1%:%ss:~1%
170+
ECHO ### END ###
171+
172+
:SCRIPTEND
REM ~ The early exits of the preference checks jump here after COLOR 1F was applied, so the default colors are restored here as well (harmless when they were already reset)
COLOR
ENDLOCAL
174+
175+
REM ~ keep the command line window open
176+
CMD /k

0 commit comments

Comments
 (0)