Skip to content

Commit a71d85d

Browse files
authored
Merge pull request #2 from MEITREX/pdf
Extract text from pdfs and ppt files
2 parents a74e896 + 2d70812 commit a71d85d

6 files changed

Lines changed: 70 additions & 1 deletion

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
.DS_Store
12
venv
23
.vscode
34
*.mp3
45
*.mp4
56
*.wav
67
*.vtt
7-
__pycache__
8+
__pycache__

03 BayesReasoning.pdf

6.47 MB
Binary file not shown.

13-mlops-2.pdf

1.23 MB
Binary file not shown.

Blatt 5.pdf

151 KB
Binary file not shown.

src/fileextractlib/Pdf.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import os
2+
from pdf2image import convert_from_path
3+
import pytesseract
4+
from os import path
5+
import argparse
6+
7+
class PdfProcessor:
    """
    Can be used to convert documents in pdf format into raw text
    """

    def process(self, file_name: str) -> list[tuple[int, bytes]]:
        """Run OCR over every page of a PDF file.

        The document is converted to page images with
        ``convert_from_path`` and each image is passed to
        ``pytesseract.image_to_string``.

        Args:
            file_name: Path of the PDF file to read.

        Returns:
            One ``(page_number, text)`` tuple per page, in document order.
            ``page_number`` is zero-based (it comes straight from
            ``enumerate``) and ``text`` is the recognised text encoded as
            UTF-8 bytes.
        """
        doc = convert_from_path(file_name)
        # Exported before the OCR calls below; presumably caps the number of
        # OpenMP threads Tesseract may use -- TODO confirm the intended limit.
        os.environ['OMP_THREAD_LIMIT'] = '20'

        # Each entry is a tuple rather than a set literal: a set is unordered,
        # so callers could only tell the page number from the text by type.
        return [
            (page_number, pytesseract.image_to_string(page_data).encode("utf-8"))
            for page_number, page_data in enumerate(doc)
        ]
25+
26+
27+
if __name__ == "__main__":
    # Command-line entry point: process the PDF passed via --file and print
    # whatever PdfProcessor.process returns.
    parser = argparse.ArgumentParser(
        description="Extract the text of a PDF file page by page."
    )
    # Required: without a path, None would be handed to the processor and the
    # failure would surface deep inside the PDF conversion instead of here.
    parser.add_argument(
        "--file", required=True, help="path of the PDF file to process"
    )
    args = parser.parse_args()

    processor = PdfProcessor()
    result = processor.process(args.file)
    print(result)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import argparse
2+
import json
3+
from typing import Any
4+
5+
from pptx import Presentation
6+
7+
8+
class PowerPointProcessor:
    """Extracts the text contained in the slides of a .pptx presentation."""

    def extract_text_from_pptx(
        self, filename: str, dump_path: "str | None" = "test.json"
    ) -> dict[Any, Any]:
        """Collect the text of every slide of a presentation.

        Args:
            filename: Path of the .pptx file, passed to ``Presentation``.
            dump_path: File the result is additionally written to as JSON.
                Defaults to ``"test.json"`` (the previously hard-coded
                location); pass ``None`` to skip writing a file.

        Returns:
            A dict mapping the one-based slide number to the concatenated
            text of all shapes on that slide that have a text frame with
            non-whitespace text. Shape texts are joined without a separator.
        """
        prs = Presentation(filename)
        text_runs = {}

        for page, slide in enumerate(prs.slides, start=1):
            # Shapes without a text frame, or with only whitespace, add nothing.
            text_runs[page] = "".join(
                shape.text
                for shape in slide.shapes
                if shape.has_text_frame and shape.text.strip()
            )

        if dump_path is not None:
            with open(dump_path, "w", encoding="utf8") as file:
                json.dump(text_runs, file, indent=2, ensure_ascii=False)

        return text_runs
26+
27+
28+
if __name__ == "__main__":
    # Guarded so that importing this module only defines PowerPointProcessor
    # and does not parse the importing program's command line.
    parser = argparse.ArgumentParser()
    parser.add_argument("--file")
    args = parser.parse_args()

    processor = PowerPointProcessor()
    text_content = processor.extract_text_from_pptx(args.file)
    print(text_content)

0 commit comments

Comments
 (0)