compose_detailed parsing

PaulAsjes · PaulAsjes · commit 21375cc331f9 · 2025-08-20T16:30:10.000+02:00
diff --git a/src/elevenlabs/client.py b/src/elevenlabs/client.py
@@ -8,6 +8,7 @@
 from .environment import ElevenLabsEnvironment
 from .realtime_tts import RealtimeTextToSpeechClient
 from .webhooks_custom import WebhooksClient, AsyncWebhooksClient
+from .music_custom import MusicClient, AsyncMusicClient
 
 
 # this is used as the default value for optional parameters
@@ -59,6 +60,7 @@ def __init__(
         )
         self.text_to_speech = RealtimeTextToSpeechClient(client_wrapper=self._client_wrapper)
         self.webhooks = WebhooksClient(client_wrapper=self._client_wrapper)
+        self.music = MusicClient(client_wrapper=self._client_wrapper)
 
 
 class AsyncElevenLabs(AsyncBaseElevenLabs):
@@ -102,3 +104,4 @@ def __init__(
             httpx_client=httpx_client
         )
         self.webhooks = AsyncWebhooksClient(client_wrapper=self._client_wrapper)
+        self.music = AsyncMusicClient(client_wrapper=self._client_wrapper)
diff --git a/src/elevenlabs/music_custom.py b/src/elevenlabs/music_custom.py
@@ -0,0 +1,286 @@
+import typing
+import json
+import re
+from dataclasses import dataclass
+
+from elevenlabs.music.client import MusicClient as AutogeneratedMusicClient, AsyncMusicClient as AutogeneratedAsyncMusicClient
+from elevenlabs.types.music_prompt import MusicPrompt
+from elevenlabs.music.types.music_compose_detailed_request_output_format import MusicComposeDetailedRequestOutputFormat
+from elevenlabs.core.request_options import RequestOptions
+
+# this is used as the default value for optional parameters
+OMIT = typing.cast(typing.Any, ...)
+
+
+@dataclass
+class SongMetadata:
+    """
+    Plain record of descriptive fields for a song.
+
+    NOTE(review): nothing visible in this module constructs or reads this class;
+    presumably it mirrors the "songMetadata" object carried in the JSON part of
+    the compose_detailed response - confirm against the API schema.
+    """
+    # Field meanings below are inferred from the names only - TODO confirm.
+    title: str
+    description: str
+    # Presumably free-form genre labels.
+    genres: typing.List[str]
+    # Presumably language identifiers; the exact format is not shown here.
+    languages: typing.List[str]
+    is_explicit: bool
+
+
+@dataclass
+class MultipartResponse:
+    """
+    Result of parsing the multipart body returned by compose_detailed.
+
+    Instances are built by the multipart parsers of MusicClient and
+    AsyncMusicClient in this module.
+    """
+    # Parsed JSON object taken from the part declared as application/json.
+    json: typing.Dict[str, typing.Any]  # Contains compositionPlan and songMetadata
+    # Bytes of the audio part as extracted by the parser.
+    audio: bytes
+    # Name taken from a filename="..." header; the parsers fall back to
+    # 'generated_music.mp3' when no such header is found.
+    filename: str
+
+
+class MusicClient(AutogeneratedMusicClient):
+    """
+    A client to handle ElevenLabs music-related functionality
+    Extends the autogenerated client to include custom music methods
+    """
+
+    def compose_detailed(
+        self,
+        *,
+        output_format: typing.Optional[MusicComposeDetailedRequestOutputFormat] = None,
+        prompt: typing.Optional[str] = OMIT,
+        music_prompt: typing.Optional[MusicPrompt] = OMIT,
+        composition_plan: typing.Optional[MusicPrompt] = OMIT,
+        music_length_ms: typing.Optional[int] = OMIT,
+        model_id: typing.Optional[typing.Literal["music_v1"]] = OMIT,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> MultipartResponse:
+        """
+        Compose a song from a prompt or a composition plan with detailed response parsing.
+
+        Every argument is forwarded unchanged to the autogenerated
+        ``compose_detailed``; the byte stream it returns is then read to the end
+        and parsed, so the whole response is held in memory.
+
+        Returns:
+            MultipartResponse containing parsed JSON metadata, audio bytes, and filename.
+
+        Raises:
+            ValueError: If the response is empty, has no second multipart
+                boundary, or its JSON part cannot be parsed.
+        """
+        stream = super().compose_detailed(
+            output_format=output_format,
+            prompt=prompt,
+            music_prompt=music_prompt,
+            composition_plan=composition_plan,
+            music_length_ms=music_length_ms,
+            model_id=model_id,
+            request_options=request_options,
+        )
+        return self._parse_multipart(stream)
+
+    def _parse_multipart(self, stream: typing.Iterator[bytes]) -> MultipartResponse:
+        """
+        Reads a byte stream containing multipart data and parses it into JSON and audio parts.
+
+        Args:
+            stream: Iterator of bytes from ElevenLabs music API response
+
+        Returns:
+            MultipartResponse containing parsed JSON metadata, audio bytes, and filename
+
+        Raises:
+            ValueError: See ``_parse_multipart_bytes``.
+        """
+        return self._parse_multipart_bytes(b''.join(stream))
+
+    @staticmethod
+    def _split_part(section: bytes) -> typing.Tuple[bytes, bytes, bytes]:
+        """
+        Split one multipart section into ``(headers, body, newline)``.
+
+        The blank line ending the headers may be CRLF or bare LF; whichever
+        occurs first wins, so a stray ``\\n\\n`` or ``\\r\\n\\r\\n`` inside a binary
+        body cannot be mistaken for the separator. ``newline`` reports the style
+        that was found. Without any blank line the section is treated as
+        headers only and the body is empty.
+        """
+        crlf_at = section.find(b'\r\n\r\n')
+        lf_at = section.find(b'\n\n')
+        if crlf_at != -1 and (lf_at == -1 or crlf_at < lf_at):
+            return section[:crlf_at], section[crlf_at + 4:], b'\r\n'
+        if lf_at != -1:
+            return section[:lf_at], section[lf_at + 2:], b'\n'
+        return section, b'', b'\n'
+
+    @classmethod
+    def _parse_multipart_bytes(cls, response_bytes: bytes) -> MultipartResponse:
+        """
+        Parse a fully buffered multipart body into a MultipartResponse.
+
+        The body is handled as bytes throughout: only the JSON body and the
+        filename are decoded, never the audio.
+
+        Raises:
+            ValueError: If the body is empty, has no second boundary, or the
+                first part is not a parseable, non-empty JSON object.
+        """
+        head = response_bytes.lstrip()
+        if not head:
+            raise ValueError("Empty response from music API")
+
+        # The first non-blank line is the boundary delimiter (e.g. b'--abc123').
+        # It is never empty here because `head` starts with a non-blank byte.
+        boundary = head.split(b'\n', 1)[0].strip()
+
+        json_start = response_bytes.find(boundary) + len(boundary)
+        second_boundary = response_bytes.find(boundary, json_start)
+        if second_boundary == -1:
+            raise ValueError('Could not find audio part boundary')
+
+        json_headers, json_body, _ = cls._split_part(response_bytes[json_start:second_boundary])
+        audio_headers, audio_buffer, newline = cls._split_part(
+            response_bytes[second_boundary + len(boundary):]
+        )
+
+        # Stop the audio at the next delimiter line so the closing
+        # "--boundary--" marker is not returned as audio data. When no further
+        # delimiter exists the audio runs to the end of the response.
+        audio_end = audio_buffer.find(newline + boundary)
+        if audio_end != -1:
+            audio_buffer = audio_buffer[:audio_end]
+
+        json_data = None
+        if b'application/json' in json_headers.lower():
+            try:
+                json_data = json.loads(json_body.decode('utf-8'))
+            except ValueError as err:
+                # Covers both JSONDecodeError and UnicodeDecodeError; keep the
+                # cause attached instead of printing it.
+                raise ValueError('Could not parse JSON data') from err
+        if not isinstance(json_data, dict) or not json_data:
+            raise ValueError('Could not parse JSON data')
+
+        # First filename="..." found in the part headers, in document order.
+        filename = 'generated_music.mp3'
+        match = re.search(rb'filename="([^"]+)"', json_headers + b'\n' + audio_headers)
+        if match:
+            filename = match.group(1).decode('utf-8', errors='ignore')
+
+        return MultipartResponse(
+            json=json_data,
+            audio=audio_buffer,
+            filename=filename
+        )
+
+
+class AsyncMusicClient(AutogeneratedAsyncMusicClient):
+    """
+    An async client to handle ElevenLabs music-related functionality
+    Extends the autogenerated async client to include custom music methods
+    """
+
+    async def compose_detailed(
+        self,
+        *,
+        output_format: typing.Optional[MusicComposeDetailedRequestOutputFormat] = None,
+        prompt: typing.Optional[str] = OMIT,
+        music_prompt: typing.Optional[MusicPrompt] = OMIT,
+        composition_plan: typing.Optional[MusicPrompt] = OMIT,
+        music_length_ms: typing.Optional[int] = OMIT,
+        model_id: typing.Optional[typing.Literal["music_v1"]] = OMIT,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> MultipartResponse:
+        """
+        Compose a song from a prompt or a composition plan with detailed response parsing.
+
+        Every argument is forwarded unchanged to the autogenerated
+        ``compose_detailed``. The parent call is not awaited here; its result is
+        consumed with ``async for`` until exhausted, so the whole response is
+        held in memory before parsing.
+
+        Returns:
+            MultipartResponse containing parsed JSON metadata, audio bytes, and filename.
+
+        Raises:
+            ValueError: If the response is empty, has no second multipart
+                boundary, or its JSON part cannot be parsed.
+        """
+        stream = super().compose_detailed(
+            output_format=output_format,
+            prompt=prompt,
+            music_prompt=music_prompt,
+            composition_plan=composition_plan,
+            music_length_ms=music_length_ms,
+            model_id=model_id,
+            request_options=request_options,
+        )
+        return await self._parse_multipart_async(stream)
+
+    async def _parse_multipart_async(self, stream: typing.AsyncIterator[bytes]) -> MultipartResponse:
+        """
+        Reads an async byte stream containing multipart data and parses it into JSON and audio parts.
+
+        Args:
+            stream: AsyncIterator of bytes from ElevenLabs music API response
+
+        Returns:
+            MultipartResponse containing parsed JSON metadata, audio bytes, and filename
+
+        Raises:
+            ValueError: See ``_parse_multipart_bytes``.
+        """
+        chunks = [chunk async for chunk in stream]
+        return self._parse_multipart_bytes(b''.join(chunks))
+
+    @staticmethod
+    def _split_part(section: bytes) -> typing.Tuple[bytes, bytes, bytes]:
+        """
+        Split one multipart section into ``(headers, body, newline)``.
+
+        The blank line ending the headers may be CRLF or bare LF; whichever
+        occurs first wins, so a stray ``\\n\\n`` or ``\\r\\n\\r\\n`` inside a binary
+        body cannot be mistaken for the separator. ``newline`` reports the style
+        that was found. Without any blank line the section is treated as
+        headers only and the body is empty.
+        """
+        crlf_at = section.find(b'\r\n\r\n')
+        lf_at = section.find(b'\n\n')
+        if crlf_at != -1 and (lf_at == -1 or crlf_at < lf_at):
+            return section[:crlf_at], section[crlf_at + 4:], b'\r\n'
+        if lf_at != -1:
+            return section[:lf_at], section[lf_at + 2:], b'\n'
+        return section, b'', b'\n'
+
+    @classmethod
+    def _parse_multipart_bytes(cls, response_bytes: bytes) -> MultipartResponse:
+        """
+        Parse a fully buffered multipart body into a MultipartResponse.
+
+        Purely synchronous: all awaiting happens while the stream is collected.
+        The body is handled as bytes throughout: only the JSON body and the
+        filename are decoded, never the audio.
+
+        Raises:
+            ValueError: If the body is empty, has no second boundary, or the
+                first part is not a parseable, non-empty JSON object.
+        """
+        head = response_bytes.lstrip()
+        if not head:
+            raise ValueError("Empty response from music API")
+
+        # The first non-blank line is the boundary delimiter (e.g. b'--abc123').
+        # It is never empty here because `head` starts with a non-blank byte.
+        boundary = head.split(b'\n', 1)[0].strip()
+
+        json_start = response_bytes.find(boundary) + len(boundary)
+        second_boundary = response_bytes.find(boundary, json_start)
+        if second_boundary == -1:
+            raise ValueError('Could not find audio part boundary')
+
+        json_headers, json_body, _ = cls._split_part(response_bytes[json_start:second_boundary])
+        audio_headers, audio_buffer, newline = cls._split_part(
+            response_bytes[second_boundary + len(boundary):]
+        )
+
+        # Stop the audio at the next delimiter line so the closing
+        # "--boundary--" marker is not returned as audio data. When no further
+        # delimiter exists the audio runs to the end of the response.
+        audio_end = audio_buffer.find(newline + boundary)
+        if audio_end != -1:
+            audio_buffer = audio_buffer[:audio_end]
+
+        json_data = None
+        if b'application/json' in json_headers.lower():
+            try:
+                json_data = json.loads(json_body.decode('utf-8'))
+            except ValueError as err:
+                # Covers both JSONDecodeError and UnicodeDecodeError; keep the
+                # cause attached instead of printing it.
+                raise ValueError('Could not parse JSON data') from err
+        if not isinstance(json_data, dict) or not json_data:
+            raise ValueError('Could not parse JSON data')
+
+        # First filename="..." found in the part headers, in document order.
+        filename = 'generated_music.mp3'
+        match = re.search(rb'filename="([^"]+)"', json_headers + b'\n' + audio_headers)
+        if match:
+            filename = match.group(1).decode('utf-8', errors='ignore')
+
+        return MultipartResponse(
+            json=json_data,
+            audio=audio_buffer,
+            filename=filename
+        )