MEITREX
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎03 BayesReasoning.pdf‎
6.47 MB b/‎03 BayesReasoning.pdf‎
6.47 MB
diff --git a/‎13-mlops-2.pdf‎
1.23 MB b/‎13-mlops-2.pdf‎
1.23 MB
diff --git a/‎Blatt 5.pdf‎
151 KB b/‎Blatt 5.pdf‎
151 KB
diff --git a/‎src/fileextractlib/Pdf.py‎
Lines changed: 34 additions & 0 deletions b/‎src/fileextractlib/Pdf.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎src/fileextractlib/PowerPointProcessor.py‎
Lines changed: 34 additions & 0 deletions b/‎src/fileextractlib/PowerPointProcessor.py‎
Lines changed: 34 additions & 0 deletions
@@ -1,7 +1,8 @@
+.DS_Store
 venv
 .vscode
 *.mp3
 *.mp4
 *.wav
 *.vtt
-__pycache__
+__pycache__
@@ -0,0 +1,34 @@
+import os
+from pdf2image import convert_from_path
+import pytesseract
+from os import path
+import argparse
+
+class PdfProcessor:
+    """
+    Convert documents in PDF format into raw text using OCR.
+
+    Each page is turned into an image by ``pdf2image`` and then passed to
+    ``pytesseract`` for text recognition.
+    """
+
+    def process(self, file_name: str) -> list[tuple[int, bytes]]:
+        """
+        Run OCR on every page of a PDF file.
+
+        :param file_name: path of the PDF file to read.
+        :return: one ``(page_number, text)`` tuple per page, in document
+            order. ``page_number`` is zero-based and ``text`` is the
+            recognised text encoded as UTF-8 bytes.
+        """
+        doc = convert_from_path(file_name)
+        # Set before the OCR calls below; presumably caps the number of
+        # OpenMP threads Tesseract may use -- confirm against the Tesseract
+        # documentation.
+        os.environ['OMP_THREAD_LIMIT'] = '20'
+
+        # A tuple (rather than a set) keeps the page number paired with its
+        # text in a fixed order, so callers can tell the two values apart
+        # by position.
+        return [
+            (page_number, pytesseract.image_to_string(page_image).encode("utf-8"))
+            for page_number, page_image in enumerate(doc)
+        ]
+
+
+if __name__ == "__main__":
+    # Command-line entry point: OCR the PDF given via --file and print the
+    # extracted pages.
+    parser = argparse.ArgumentParser()
+
+    # Required so that a missing path is reported by argparse instead of
+    # None being handed to the PDF conversion.
+    parser.add_argument("--file", required=True)
+    args = parser.parse_args()
+    processor = PdfProcessor()
+    result = processor.process(args.file)
+    print(result)
@@ -0,0 +1,34 @@
+import argparse
+import json
+from typing import Any
+
+from pptx import Presentation
+
+
+class PowerPointProcessor:
+    """
+    Extract the text of PowerPoint (.pptx) presentations slide by slide.
+    """
+
+    def extract_text_from_pptx(
+        self, filename: str, dump_path: "str | None" = "test.json"
+    ) -> dict[int, str]:
+        """
+        Collect the text of every slide in a presentation.
+
+        :param filename: path of the .pptx file to read.
+        :param dump_path: file the result is additionally written to as
+            JSON. Defaults to ``"test.json"`` (relative to the current
+            working directory); pass ``None`` to skip writing a file.
+        :return: mapping of one-based slide number to the slide's text.
+            Slides without any non-blank text shape are omitted.
+        """
+        prs = Presentation(filename)
+        text_runs = {}
+
+        for slide_number, slide in enumerate(prs.slides, start=1):
+            # Shape texts are concatenated without a separator, in the
+            # iteration order of ``slide.shapes``.
+            parts = [
+                shape.text
+                for shape in slide.shapes
+                if shape.has_text_frame and shape.text.strip()
+            ]
+            if parts:
+                text_runs[slide_number] = "".join(parts)
+
+        if dump_path is not None:
+            with open(dump_path, "w", encoding="utf8") as file:
+                # json.dump writes the integer slide numbers as string keys.
+                json.dump(text_runs, file, indent=2, ensure_ascii=False)
+
+        return text_runs
+
+
+if __name__ == "__main__":
+    # Command-line entry point, guarded so that importing this module does
+    # not parse sys.argv or process a file as a side effect.
+    parser = argparse.ArgumentParser()
+    # Required so that a missing path is reported by argparse instead of
+    # None being handed to the extraction.
+    parser.add_argument("--file", required=True)
+    args = parser.parse_args()
+    processor = PowerPointProcessor()
+    text_content = processor.extract_text_from_pptx(args.file)
+    print(text_content)