Merge pull request #91 from dreadnode/feature/multimodal-extras-remove-pydub

rdheekonda · web-flow · commit 3841b1362833 · 2025-06-26T13:23:44.000-07:00
feat: add multimodal extras and remove pydub for Python 3.13 compatibility
diff --git a/README.md b/README.md
@@ -85,8 +85,33 @@ pip install -U dreadnode
 ```
 
 If you want to build from source:
+
 ```bash
+
 poetry install
+# Install with multimodal extras
+poetry install --extras multimodal
+
+# Install with training extras
+poetry install --extras training
+
+# Install with all extras
+poetry install --all-extras
+```
+
+## Installation from PyPI with Optional Features
+
+For advanced media processing capabilities (audio, video, images), install the multimodal extras:
+
+```bash
+# Multimodal support (audio, video, and image processing)
+pip install -U dreadnode[multimodal]
+
+# Training support (ML model integration)
+pip install -U dreadnode[training]
+
+# All optional features
+pip install -U dreadnode[all]
 ```
 
 See our **[installation guide](https://docs.dreadnode.io/strikes/install)** for more options.
diff --git a/docs/sdk/data_types.mdx b/docs/sdk/data_types.mdx
@@ -24,7 +24,6 @@ Supports:
 - Local file paths (str or Path)
 - Numpy arrays with sample rate
 - Raw bytes
-- Pydub AudioSegment object
 
 Initialize an Audio object.
 
@@ -36,7 +35,6 @@ Initialize an Audio object.
   - A path to a local audio file (str or Path)
   - A numpy array (requires sample\_rate)
   - Raw bytes
-  - A pydub AudioSegment
 * **`sample_rate`**
   (`int | None`, default:
   `None`
@@ -70,7 +68,6 @@ def __init__(
             - A path to a local audio file (str or Path)
             - A numpy array (requires sample_rate)
             - Raw bytes
-            - A pydub AudioSegment
         sample_rate: Required when using numpy arrays
         caption: Optional caption for the audio
         format: Optional format to use (default is wav for numpy arrays)
diff --git a/docs/usage/rich-objects.mdx b/docs/usage/rich-objects.mdx
@@ -76,17 +76,6 @@ with dn.run("audio-example-numpy"):
     dn.log_input("my-audio", dn.Audio(audio_data, sample_rate=sample_rate))
 ```
 
-```python AudioSegment
-import dreadnode as dn
-from pydub import AudioSegment
-
-# Load audio with pydub
-audio_segment = AudioSegment.from_file("path/to/audio.mp3")
-
-with dn.run("audio-example-segment"):
-    dn.log_input("my-audio", dn.Audio(audio_segment))
-```
-
 ```python Raw Bytes
 import dreadnode as dn
 
diff --git a/dreadnode/data_types/audio.py b/dreadnode/data_types/audio.py
@@ -3,12 +3,15 @@
 from pathlib import Path
 
 import numpy as np
-import soundfile as sf  # type: ignore  # noqa: PGH003
-from pydub import AudioSegment  # type: ignore  # noqa: PGH003
+
+try:
+    import soundfile as sf  # type: ignore  # noqa: PGH003
+except ImportError:
+    sf = None
 
 from dreadnode.data_types.base_data_type import BaseDataType
 
-AudioDataType: t.TypeAlias = str | Path | np.ndarray[t.Any, t.Any] | bytes | AudioSegment
+AudioDataType: t.TypeAlias = str | Path | np.ndarray[t.Any, t.Any] | bytes
 
 
 class Audio(BaseDataType):
@@ -19,7 +22,6 @@ class Audio(BaseDataType):
     - Local file paths (str or Path)
     - Numpy arrays with sample rate
     - Raw bytes
-    - Pydub AudioSegment object
     """
 
     def __init__(
@@ -37,11 +39,15 @@ def __init__(
                 - A path to a local audio file (str or Path)
                 - A numpy array (requires sample_rate)
                 - Raw bytes
-                - A pydub AudioSegment
             sample_rate: Required when using numpy arrays
             caption: Optional caption for the audio
             format: Optional format to use (default is wav for numpy arrays)
         """
+        if sf is None:
+            raise ImportError(
+                "Audio processing requires optional dependencies. "
+                "Install with: pip install dreadnode[multimodal]"
+            )
         self._data = data
         self._sample_rate = sample_rate
         self._caption = caption
@@ -69,8 +75,6 @@ def _process_audio_data(self) -> tuple[bytes, str, int | None, float | None]:
             return self._process_numpy_array()
         if isinstance(self._data, bytes):
             return self._process_raw_bytes()
-        if isinstance(self._data, AudioSegment):
-            return self._process_pydub_audio_segment()
         raise TypeError(f"Unsupported audio data type: {type(self._data)}")
 
     def _process_file_path(self) -> tuple[bytes, str, int | None, float | None]:
@@ -123,29 +127,6 @@ def _process_raw_bytes(self) -> tuple[bytes, str, int | None, float | None]:
             raise TypeError("Raw bytes are expected for this processing method.")
         return self._data, format_name, self._sample_rate, None
 
-    def _process_pydub_audio_segment(self) -> tuple[bytes, str, int | None, float | None]:
-        """
-        Process pydub AudioSegment to bytes.
-        Returns:
-            A tuple of (audio_bytes, format_name, sample_rate, duration)
-        """
-
-        if not isinstance(self._data, AudioSegment):
-            raise TypeError("AudioSegment is expected for this processing method.")
-
-        sample_rate = self._data.frame_rate
-
-        buffer = io.BytesIO()
-        format_name = self._format or "wav"
-        self._data.export(buffer, format=format_name)
-        buffer.seek(0)
-        audio_bytes = buffer.read()
-
-        # PyDUB provides duration in milliseconds, convert to seconds for consistency
-        duration = len(self._data) / 1000.0
-
-        return audio_bytes, format_name, sample_rate, duration
-
     def _generate_metadata(
         self, format_name: str, sample_rate: int | None, duration: float | None
     ) -> dict[str, str | int | float | None]:
@@ -166,20 +147,13 @@ def _generate_metadata(
             metadata["source-type"] = "numpy.ndarray"
         elif isinstance(self._data, bytes):
             metadata["source-type"] = "bytes"
-        elif isinstance(self._data, AudioSegment):
-            metadata["source-type"] = "pydub.AudioSegment"
 
         if sample_rate is not None:
             metadata["sample-rate"] = sample_rate
 
         if duration is not None:
             metadata["duration"] = duration
 
-        # Add pydub-specific metadata if available
-        if isinstance(self._data, AudioSegment):
-            metadata["channels"] = self._data.channels
-            metadata["sample-width"] = self._data.sample_width
-
         if self._caption:
             metadata["caption"] = self._caption
 
diff --git a/dreadnode/data_types/image.py b/dreadnode/data_types/image.py
@@ -4,11 +4,15 @@
 from pathlib import Path
 
 import numpy as np
-from PIL import Image as PILImage
 
 from dreadnode.data_types.base_data_type import BaseDataType
 
-ImageDataType = PILImage.Image | np.ndarray[t.Any, t.Any]
+try:
+    from PIL import Image as PILImage
+except ImportError:
+    PILImage = None  # type: ignore[assignment]
+
+ImageDataType = t.Any | np.ndarray[t.Any, t.Any]
 ImageDataOrPathType = str | Path | bytes | ImageDataType
 
 
@@ -44,6 +48,10 @@ def __init__(
             caption: Optional caption for the image
             format: Optional format to use when saving (png, jpg, etc.)
         """
+        if PILImage is None:
+            raise ImportError(
+                "Image processing requires PIL (Pillow). Install with: pip install dreadnode[multimodal]"
+            )
         self._data = data
         self._mode = mode
         self._caption = caption
diff --git a/dreadnode/data_types/video.py b/dreadnode/data_types/video.py
@@ -4,13 +4,19 @@
 from pathlib import Path
 
 import numpy as np
-from moviepy.video.io.ImageSequenceClip import ImageSequenceClip  # type: ignore  # noqa: PGH003
-from moviepy.video.VideoClip import VideoClip  # type: ignore  # noqa: PGH003
 from numpy.typing import NDArray
 
 from dreadnode.data_types.base_data_type import BaseDataType
 
-VideoDataType: t.TypeAlias = str | Path | NDArray[t.Any] | bytes | list[NDArray[t.Any]] | VideoClip
+try:
+    from moviepy.video.io.ImageSequenceClip import ImageSequenceClip  # type: ignore  # noqa: PGH003
+    from moviepy.video.VideoClip import VideoClip  # type: ignore  # noqa: PGH003
+except ImportError:
+    ImageSequenceClip = None
+    VideoClip = None
+
+
+VideoDataType: t.TypeAlias = str | Path | NDArray[t.Any] | bytes | list[NDArray[t.Any]] | t.Any
 
 
 class Video(BaseDataType):
@@ -70,8 +76,13 @@ def to_serializable(self) -> tuple[bytes, dict[str, t.Any]]:
             return self._process_bytes()
         if isinstance(self._data, (np.ndarray, list)):
             return self._process_numpy_array()
-        if isinstance(self._data, VideoClip):
+        if VideoClip is not None and isinstance(self._data, VideoClip):
             return self._process_moviepy_clip()
+        if VideoClip is None and hasattr(self._data, "write_videofile"):
+            raise ImportError(
+                "MoviePy VideoClip detected but moviepy not installed. "
+                "Install with: pip install dreadnode[multimodal]"
+            )
         raise TypeError(f"Unsupported video data type: {type(self._data)}")
 
     def _process_file_path(self) -> tuple[bytes, dict[str, t.Any]]:
@@ -110,13 +121,31 @@ def _process_numpy_array(self) -> tuple[bytes, dict[str, t.Any]]:
         Returns:
             A tuple of (video_bytes, metadata_dict)
         """
+        if ImageSequenceClip is None:
+            raise ImportError(
+                "Video processing from numpy arrays requires moviepy. "
+                "Install with: pip install dreadnode[multimodal]"
+            )
         if not self._fps:
             raise ValueError("fps is required for numpy array video frames")
         if not isinstance(self._data, (np.ndarray, list)):
             raise TypeError("data must be a numpy array or list of numpy arrays")
+
+        # Type guard for mypy
+        assert ImageSequenceClip is not None  # noqa: S101
+
+        frames = self._extract_frames_from_data()
+        if not frames:
+            raise ValueError("No frames found in input data")
+
+        return self._create_video_from_frames_data(frames)
+
+    def _extract_frames_from_data(self) -> list[NDArray[t.Any]]:
+        """Extract frames from numpy array or list data."""
         frames = []
         rgb_dim = 3
         rgba_dim = 4
+
         if isinstance(self._data, np.ndarray):
             if self._data.ndim == rgb_dim:  # Single frame
                 frames = [self._data]
@@ -127,23 +156,23 @@ def _process_numpy_array(self) -> tuple[bytes, dict[str, t.Any]]:
         elif isinstance(self._data, list):
             frames = self._data
 
-        if not frames:
-            raise ValueError("No frames found in input data")
+        return frames
 
+    def _create_video_from_frames_data(
+        self, frames: list[NDArray[t.Any]]
+    ) -> tuple[bytes, dict[str, t.Any]]:
+        """Create video file from frames."""
         frame_height, frame_width = frames[0].shape[:2]
-
         temp_fd, temp_path = tempfile.mkstemp(suffix=f".{self._format}")
         os.close(temp_fd)
 
         try:
             # Create clip and write to file
             clip = ImageSequenceClip(frames, fps=self._fps)
-
             clip.write_videofile(
                 temp_path,
                 fps=self._fps,
             )
-
             video_bytes = Path(temp_path).read_bytes()
 
             metadata = self._generate_metadata(self._format)
diff --git a/examples/log_object/audio.ipynb b/examples/log_object/audio.ipynb
@@ -6,14 +6,16 @@
    "source": [
     "# Dreadnode Audio Logging\n",
     "\n",
-    "This notebook demonstrates how to log audio data using Dreadnode's `Audio` data type. The examples cover various audio formats and sources including file paths, numpy arrays, and pydub AudioSegment objects.\n",
+    "This notebook demonstrates how to log audio data using Dreadnode's `Audio` data type. The examples cover various audio formats and sources, including file paths and numpy arrays.\n",
     "\n",
     "## Features\n",
     "\n",
     "- Log audio files directly from disk (WAV, MP3, etc.)\n",
     "- Convert and log numpy arrays as audio\n",
-    "- Process and log pydub AudioSegment objects\n",
-    "- Add captions and metadata to audio logs"
+    "- Add captions and metadata to audio logs\n",
+    "\n",
+    "⚠️ Note: Ensure you have installed the multimodal extras to use the Audio data type:\n",
+    "`pip install dreadnode[multimodal]`"
    ]
   },
   {
@@ -128,32 +130,6 @@
     "    dn.log_input(\"stereo_sine\", Audio(stereo, sample_rate=sample_rate, caption=\"Stereo audio (440 Hz left, 880 Hz right)\"))"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3. Pydub AudioSegment Examples\n",
-    "\n",
-    "Pydub is a popular library for audio manipulation in Python. Dreadnode supports logging AudioSegment objects directly, which enables powerful audio processing before logging."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pydub import AudioSegment\n",
-    "from pydub.generators import Sine\n",
-    "\n",
-    "with dn.run(\"audio_pydub_examples\") as r:\n",
-    "    # Load the file with pydub\n",
-    "    audio_segment = AudioSegment.from_file(audio_file_path)\n",
-    "    \n",
-    "    # Log the original AudioSegment\n",
-    "    dn.log_input(\"pydub_original\", Audio(audio_segment, caption=\"Original audio with pydub\"))\n"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -181,7 +157,6 @@
     "\n",
     "1. Audio files (e.g., WAV, MP3 files)\n",
     "2. Numpy arrays with sample rate\n",
-    "3. Pydub AudioSegment objects\n",
     "4. Audio with custom metadata and captions\n",
     "\n",
     "We also showed more advanced audio processing techniques including:\n",
@@ -194,7 +169,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "dreadnode-py3.12",
    "language": "python",
    "name": "python3"
   },
diff --git a/examples/log_object/image.ipynb b/examples/log_object/image.ipynb
@@ -14,7 +14,10 @@
     "- Convert and log PIL Image objects\n",
     "- Transform numpy arrays into images\n",
     "- Handle raw bytes and base64 encoded images\n",
-    "- Convert between image modes (RGB, RGBA, grayscale)"
+    "- Convert between image modes (RGB, RGBA, grayscale)\n",
+    "\n",
+    "⚠️ Note: Ensure you have installed the multimodal extras to use the Image data type:\n",
+    "`pip install dreadnode[multimodal]`"
    ]
   },
   {
@@ -265,7 +268,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
diff --git a/examples/log_object/video.ipynb b/examples/log_object/video.ipynb
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml