apache
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
Lines changed: 4 additions & 5 deletions
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
Lines changed: 3 additions & 4 deletions
diff --git a/src/main/python/systemds/scuro/dataloader/image_loader.py b/src/main/python/systemds/scuro/dataloader/image_loader.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
Lines changed: 3 additions & 1 deletion
diff --git a/src/main/python/systemds/scuro/dataloader/pdf_loader.py b/src/main/python/systemds/scuro/dataloader/pdf_loader.py
Lines changed: 70 additions & 0 deletions
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
Lines changed: 10 additions & 5 deletions
diff --git a/src/main/python/systemds/scuro/dataloader/transcript_loader.py b/src/main/python/systemds/scuro/dataloader/transcript_loader.py
Lines changed: 59 additions & 0 deletions
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
Lines changed: 4 additions & 2 deletions
diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
Lines changed: 7 additions & 10 deletions
@@ -63,18 +63,17 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         if not self.load_data_from_file:
             import numpy as np
 
-            self.metadata[file] = self.modality_type.create_metadata(
-                1000, np.array([0])
-            )
+            audio = np.array([0])
+            sr = 1000
         else:
             audio, sr = librosa.load(file, dtype=self._data_type)
 
             if self.normalize:
                 audio = librosa.util.normalize(audio)
 
-            self.metadata[file] = self.modality_type.create_metadata(sr, audio)
+        self.metadata.append(self.modality_type.create_metadata(sr, audio))
 
-            self.data.append(audio)
+        self.data.append(audio)
 
     def get_stats(self, source_path: str):
         sampling_rate = 0
 
@@ -44,9 +44,7 @@ def __init__(
         (otherwise please provide your own Dataloader that knows about the file name convention)
         """
         self.data = []
-        self.metadata = (
-            {}
-        )  # TODO: check what the index should be for storing the metadata (file_name, counter, ...)
+        self.metadata = []
         self.source_path = source_path
         self.indices = indices
         self.modality_type = modality_type
@@ -87,7 +85,7 @@ def data_type(self, data_type):
     def reset(self):
         self._next_chunk = 0
         self.data = []
-        self.metadata = {}
+        self.metadata = []
 
     def load(self):
         """
@@ -134,6 +132,7 @@ def _load_next_chunk(self):
         Loads the next chunk of data
         """
         self.data = []
+        # TODO: Handle metadata correctly
         next_chunk_indices = self.indices[
             self._next_chunk
             * self._chunk_size : (self._next_chunk + 1)
 
@@ -71,8 +71,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
 
         image = image.astype(np.uint8, copy=False)
 
-        self.metadata[file] = self.modality_type.create_metadata(
-            width, height, channels
+        self.metadata.append(
+            self.modality_type.create_metadata(width, height, channels)
         )
 
         self.data.append(image)
 
@@ -69,7 +69,9 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
 
                 text = " ".join(text) if isinstance(text, list) else text
                 self.data.append(text)
-                self.metadata[idx] = self.modality_type.create_metadata(len(text), text)
+                self.metadata.append(
+                    self.modality_type.create_metadata(len(text), text) | json_file[idx]
+                )
 
     def get_stats(self, source_path: str):
         self.file_sanity_check(source_path)
 
@@ -0,0 +1,88 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+import pymupdf
+
+import numpy as np
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
+from systemds.scuro.modality.type import ModalityType
+
+
+class PdfLoader(BaseLoader):
+    """Loader that rasterizes every page of a PDF file into an RGB image."""
+
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float16,
+        chunk_size: Optional[int] = None,
+        load=True,
+        ext=".pdf",
+    ):
+        """
+        Initializes the loader for PDF documents.
+
+        :param source_path: Path forwarded to BaseLoader as the data source
+        :param indices: Sample identifiers forwarded to BaseLoader
+        :param data_type: Data type forwarded to BaseLoader
+        :param chunk_size: Optional chunk size forwarded to BaseLoader
+        :param load: Stored as load_data_from_file; not consulted by extract
+        :param ext: File extension forwarded to BaseLoader
+        """
+        super().__init__(
+            source_path, indices, data_type, chunk_size, ModalityType.IMAGE, ext
+        )
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        """
+        Renders each page of the PDF and appends one image and one metadata
+        entry per page to self.data and self.metadata.
+
+        :param file: Path of the PDF file to rasterize
+        :param index: Unused by this loader
+        """
+        self.file_sanity_check(file)
+
+        # The context manager closes the document even if rendering or
+        # decoding a page fails.
+        with pymupdf.open(file) as doc:
+            for page in doc.pages():
+                image_bytes = page.get_pixmap().tobytes("jpg")
+                np_buffer = np.frombuffer(image_bytes, dtype=np.uint8)
+
+                image = cv2.imdecode(np_buffer, cv2.IMREAD_COLOR)
+                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+                # IMREAD_COLOR always decodes to three channels, so the shape
+                # is (height, width, channels) for every page.
+                height, width, channels = image.shape
+
+                image = image.astype(np.uint8, copy=False)
+
+                self.metadata.append(
+                    self.modality_type.create_metadata(width, height, channels)
+                )
+
+                self.data.append(image)
@@ -56,8 +56,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
-                self.metadata[file] = self.modality_type.create_metadata(
-                    len(line.split()), line
+                self.metadata.append(
+                    self.modality_type.create_metadata(len(line.split()), line)
                 )
                 self.data.append(line)
 
 
@@ -81,15 +81,20 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             data = self._normalize_signals(data)
 
         if file:
-            self.metadata[index] = self.modality_type.create_metadata(
-                self.signal_names, data, self.sampling_rate
+            self.metadata.append(
+                self.modality_type.create_metadata(
+                    self.signal_names, data, self.sampling_rate
+                )
             )
+            self.data.append(data)
         else:
             for i, index in enumerate(self.indices):
-                self.metadata[str(index)] = self.modality_type.create_metadata(
-                    self.signal_names, data[i], self.sampling_rate
+                self.metadata.append(
+                    self.modality_type.create_metadata(
+                        self.signal_names, data[i], self.sampling_rate
+                    )
                 )
-        self.data.append(data)
+                self.data.append(data[i])
 
     def _normalize_signals(self, data: np.ndarray) -> np.ndarray:
         if data.ndim == 1:
 
@@ -0,0 +1,80 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+from faster_whisper import WhisperModel
+import numpy as np
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.modality.type import ModalityType
+
+
+class TranscriptLoader(BaseLoader):
+    """Loader that transcribes files into text segments via faster-whisper."""
+
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float32,
+        chunk_size: Optional[int] = None,
+        normalize: bool = True,
+        transcribe_model_size: str = "medium",
+        load=True,
+    ):
+        """
+        Initializes the loader and creates the transcription model.
+
+        :param source_path: Path forwarded to BaseLoader as the data source
+        :param indices: Sample identifiers forwarded to BaseLoader
+        :param data_type: Data type forwarded to BaseLoader
+        :param chunk_size: Optional chunk size forwarded to BaseLoader
+        :param normalize: Stored on the loader; not consulted by extract
+        :param transcribe_model_size: Model size handed to WhisperModel
+        :param load: Stored as load_data_from_file; not consulted by extract
+        """
+        super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
+        # The model is created for CPU execution with int8 computation.
+        self.model = WhisperModel(
+            transcribe_model_size, device="cpu", compute_type="int8"
+        )
+        self.normalize = normalize
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        """
+        Transcribes the file and appends one text entry and one metadata
+        entry per returned segment to self.data and self.metadata.
+
+        :param file: Path of the file to transcribe
+        :param index: Unused by this loader
+        """
+        self.file_sanity_check(file)
+        segments, _ = self.model.transcribe(file, vad_filter=True)
+
+        for segment in segments:
+            text = segment.text
+            metadata = self.modality_type.create_metadata(len(text.split()), text)
+            metadata["timestamp_start"] = segment.start
+            metadata["timestamp_end"] = segment.end
+            metadata["text"] = text
+
+            self.metadata.append(metadata)
+            self.data.append(text)
@@ -87,8 +87,10 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         num_channels = 3
 
-        self.metadata[file] = self.modality_type.create_metadata(
-            self.fps, length, width, height, num_channels
+        self.metadata.append(
+            self.modality_type.create_metadata(
+                self.fps, length, width, height, num_channels
+            )
         )
 
         frames = []
 
@@ -77,9 +77,8 @@ def execute(self, starting_idx=0):
             )
 
         for i in range(start, end):
-            idx_1 = list(self.left_modality.metadata.values())[i + starting_idx][
-                self.condition.leftField
-            ]
+            left_meta_idx = i if self.chunk_left else i + starting_idx
+            idx_1 = self.left_modality.metadata[left_meta_idx][self.condition.leftField]
             if (
                 self.condition.alignment is None and self.condition.join_type == "<"
             ):  # TODO compute correct alignment timestamps/spatial params
@@ -90,9 +89,7 @@ def execute(self, starting_idx=0):
             if self.chunk_left:
                 i = i + starting_idx
 
-            idx_2 = list(self.right_modality.metadata.values())[i][
-                self.condition.rightField
-            ]
+            idx_2 = self.right_modality.metadata[i][self.condition.rightField]
             self.joined_right.data.append([])
 
             c = 0
@@ -228,8 +225,8 @@ def _handle_chunked_execution(self, representation):
     def _apply_representation_chunked(
         self, left_modality, right_modality, chunk_right, representation
     ):
-        new_left = Modality(left_modality.modality_type, {})
-        new_right = Modality(right_modality.modality_type, {})
+        new_left = Modality(left_modality.modality_type)
+        new_right = Modality(right_modality.modality_type)
 
         for _ in left_modality.iter_raw_data_chunks(reset=True):
             if chunk_right:
@@ -246,11 +243,11 @@ def _apply_representation_chunked(
                 self.joined_right, representation
             )
             new_right.data.extend(right_transformed.data)
-            new_right.metadata.update(right_transformed.metadata)
+            new_right.metadata.extend(right_transformed.metadata)
 
             left_transformed = self._apply_representation(left_modality, representation)
             new_left.data.extend(left_transformed.data)
-            new_left.metadata.update(left_transformed.metadata)
+            new_left.metadata.extend(left_transformed.metadata)
 
         new_left.update_metadata()
         new_right.update_metadata()
Original file line number	Diff line number	Diff line change
`@@ -71,8 +71,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):`
`71`	`71`
`72`	`72`	`image = image.astype(np.uint8, copy=False)`
`73`	`73`
`74`		`- self.metadata[file] = self.modality_type.create_metadata(`
`75`		`- width, height, channels`
	`74`	`+ self.metadata.append(`
	`75`	`+ self.modality_type.create_metadata(width, height, channels)`
`76`	`76`	`)`
`77`	`77`
`78`	`78`	`self.data.append(image)`