align media local path autodetection

nabinchha · nabinchha · commit f482dfb7e629 · 2026-05-22T11:24:16.000-06:00
diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb
@@ -311,7 +311,7 @@
     "]\n",
     "```\n",
     "\n",
-    "URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths in URL mode require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access."
+    "URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths require explicit URL mode, and the model endpoint must have filesystem access to the same paths, typically a colocated vLLM server configured for local media access."
    ]
   },
   {
diff --git a/docs/notebook_source/4-providing-images-as-context.py b/docs/notebook_source/4-providing-images-as-context.py
@@ -184,7 +184,7 @@ def convert_image_to_chat_format(record, height: int) -> dict:
 # ]
 # ```
 #
-# URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths in URL mode require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
+# URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths require explicit URL mode, and the model endpoint must have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
 
 # %%
 # Add a column to generate detailed image descriptions
diff --git a/fern/versions/latest/pages/concepts/models/default-model-settings.mdx b/fern/versions/latest/pages/concepts/models/default-model-settings.mdx
@@ -75,7 +75,7 @@ The following model configurations are automatically available when `OPENROUTER_
 | `openrouter-embedding` | `openai/text-embedding-3-large` | Text embeddings | `encoding_format="float"` |
 
 <Note title="Modality support depends on the model">
-  The `multi_modal_context` field can include image, audio, and video contexts, but each model/provider combination has its own accepted input formats, media-size limits, and modality mix. Use an image-capable model for image-only workflows, and use an omni or otherwise multimodal model before sending audio or video context. Local audio/video paths in URL mode require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
+  The `multi_modal_context` field can include image, audio, and video contexts, but each model/provider combination has its own accepted input formats, media-size limits, and modality mix. Use an image-capable model for image-only workflows, and use an omni or otherwise multimodal model before sending audio or video context. Local audio/video paths require explicit URL mode (`data_type=url`), and the model endpoint must have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
 </Note>
 
 
diff --git a/fern/versions/latest/pages/concepts/models/model-configs.mdx b/fern/versions/latest/pages/concepts/models/model-configs.mdx
@@ -9,7 +9,7 @@ Model configurations define the specific models you use for synthetic data gener
 
 A `ModelConfig` specifies which LLM model to use and how it should behave during generation. When you create column configurations (like `LLMText`, `LLMCode`, or `LLMStructured`), you reference a model by its alias. Data Designer uses the model configuration to determine which model to call and with what parameters.
 
-When a column includes `multi_modal_context`, the `ModelConfig` alias must point to a model that supports the media types you send. Data Designer can serialize image, audio, and video context blocks, but model capability is still provider-specific. Local audio/video paths in URL mode require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
+When a column includes `multi_modal_context`, the `ModelConfig` alias must point to a model that supports the media types you send. Data Designer can serialize image, audio, and video context blocks, but model capability is still provider-specific. Local audio/video paths require explicit URL mode (`data_type=url`), and the model endpoint must have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
 
 ## ModelConfig Structure
 
diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py
@@ -180,10 +180,9 @@ def _image_formats_match(configured_format: ImageFormat, detected_format: ImageF
 class AudioContext(ModalityContext):
     """Configuration for providing audio context to multimodal models.
 
-    Audio context values are URL, local path, or base64 media values. Local
-    paths are passed through so colocated vLLM servers can read them directly.
-    ``audio_format`` is consulted only for base64 sources; URL and local-path
-    sources are passed through unchanged.
+    Audio context values are URL or base64 media values. Local paths may be
+    passed through only in explicit URL mode so colocated model endpoints can
+    read them directly. ``audio_format`` is consulted only for base64 sources.
     """
 
     modality: Literal[Modality.AUDIO] = Modality.AUDIO
@@ -193,7 +192,7 @@ def get_contexts(self, record: dict, *, base_path: str | None = None) -> list[di
         """Get audio contexts.
 
         ``base_path`` is accepted for signature compatibility with ``ImageContext``
-        but unused; local audio paths are passed through unchanged.
+        but unused; audio contexts do not resolve local files to base64.
         """
         return [self._build_context(value) for value in normalize_media_context_values(record[self.column_name])]
 
@@ -202,7 +201,7 @@ def _build_context(self, context_value: Any) -> dict[str, Any]:
             self._validate_url_context_value(context_value)
             return get_media_url_context(Modality.AUDIO.value, context_value)
 
-        if self.data_type is None and (is_audio_path(context_value) or is_media_url(context_value)):
+        if self.data_type is None and is_media_url(context_value):
             return get_media_url_context(Modality.AUDIO.value, context_value)
 
         media_type, data = self._resolve_base64_parts(context_value)
@@ -223,8 +222,8 @@ def _resolve_base64_parts(self, context_value: Any) -> tuple[str, Any]:
 
         if is_audio_path(context_value):
             raise ValueError(
-                "audio base64 context values must be base64 audio data; use data_type=url "
-                "or omit data_type to pass local audio paths through"
+                "audio context values that look like local paths must use data_type=url; "
+                "otherwise provide base64 audio data"
             )
 
         if self.audio_format is None:
@@ -245,10 +244,9 @@ def _validate_audio_format(self) -> Self:
 class VideoContext(ModalityContext):
     """Configuration for providing video context to multimodal models.
 
-    Video context values are URL, local path, or base64 media values. Local
-    paths are passed through so colocated vLLM servers can read them directly.
-    ``video_format`` is consulted only for base64 sources; URL and local-path
-    sources are passed through unchanged.
+    Video context values are URL or base64 media values. Local paths may be
+    passed through only in explicit URL mode so colocated model endpoints can
+    read them directly. ``video_format`` is consulted only for base64 sources.
     """
 
     modality: Literal[Modality.VIDEO] = Modality.VIDEO
@@ -258,7 +256,7 @@ def get_contexts(self, record: dict, *, base_path: str | None = None) -> list[di
         """Get video contexts.
 
         ``base_path`` is accepted for signature compatibility with ``ImageContext``
-        but unused; local video paths are passed through unchanged.
+        but unused; video contexts do not resolve local files to base64.
         """
         return [self._build_context(value) for value in normalize_media_context_values(record[self.column_name])]
 
@@ -267,7 +265,7 @@ def _build_context(self, context_value: Any) -> dict[str, Any]:
             self._validate_url_context_value(context_value)
             return get_media_url_context(Modality.VIDEO.value, context_value)
 
-        if self.data_type is None and (is_video_path(context_value) or is_media_url(context_value)):
+        if self.data_type is None and is_media_url(context_value):
             return get_media_url_context(Modality.VIDEO.value, context_value)
 
         media_type, data = self._resolve_base64_parts(context_value)
@@ -288,8 +286,8 @@ def _resolve_base64_parts(self, context_value: Any) -> tuple[str, Any]:
 
         if is_video_path(context_value):
             raise ValueError(
-                "video base64 context values must be base64 video data; use data_type=url "
-                "or omit data_type to pass local video paths through"
+                "video context values that look like local paths must use data_type=url; "
+                "otherwise provide base64 video data"
             )
 
         if self.video_format is None:
diff --git a/packages/data-designer-config/tests/config/test_models.py b/packages/data-designer-config/tests/config/test_models.py
@@ -246,6 +246,9 @@ def test_audio_context_get_contexts_single_string() -> None:
     assert audio_context.get_contexts({"audio_url": "recordings/speech.mp3"}) == [
         get_media_url_context(Modality.AUDIO.value, "recordings/speech.mp3")
     ]
+    assert audio_context.get_contexts({"audio_url": "file:///data/recordings/speech.mp3"}) == [
+        get_media_url_context(Modality.AUDIO.value, "file:///data/recordings/speech.mp3")
+    ]
 
 
 def test_audio_context_get_contexts_list_json_and_numpy() -> None:
@@ -276,10 +279,6 @@ def test_audio_context_auto_detect_url_and_data_uri() -> None:
         get_media_url_context(Modality.AUDIO.value, "https://example.com/audio.mp3")
     ]
 
-    assert AudioContext(column_name="audio_col").get_contexts({"audio_col": "recordings/speech.wav"}) == [
-        get_media_url_context(Modality.AUDIO.value, "recordings/speech.wav")
-    ]
-
     assert AudioContext(column_name="audio_col").get_contexts({"audio_col": "https://example.com/download?id=123"}) == [
         get_media_url_context(Modality.AUDIO.value, "https://example.com/download?id=123")
     ]
@@ -289,6 +288,12 @@ def test_audio_context_auto_detect_url_and_data_uri() -> None:
     ]
 
 
+@pytest.mark.parametrize("audio_path", ["recordings/speech.wav", "file:///data/recordings/speech.mp3"])
+def test_audio_context_auto_detect_local_path_rejected(audio_path: str) -> None:
+    with pytest.raises(ValueError, match="audio context values that look like local paths must use data_type=url"):
+        AudioContext(column_name="audio_col").get_contexts({"audio_col": audio_path})
+
+
 def test_audio_context_validate_audio_format() -> None:
     with pytest.raises(ValueError, match="audio_format is required when data_type is base64"):
         AudioContext(column_name="audio_base64", data_type=ModalityDataType.BASE64)
@@ -304,11 +309,12 @@ def test_audio_context_validate_audio_format() -> None:
             {"audio_base64": "data:audio/mpeg;base64,audio1base64"}
         )
 
-    assert AudioContext(column_name="audio_base64", audio_format=AudioFormat.MP3).get_contexts(
-        {"audio_base64": "screen_recording.mp3"}
-    ) == [get_media_url_context(Modality.AUDIO.value, "screen_recording.mp3")]
+    with pytest.raises(ValueError, match="audio context values that look like local paths must use data_type=url"):
+        AudioContext(column_name="audio_base64", audio_format=AudioFormat.MP3).get_contexts(
+            {"audio_base64": "screen_recording.mp3"}
+        )
 
-    with pytest.raises(ValueError, match="audio base64 context values must be base64 audio data"):
+    with pytest.raises(ValueError, match="audio context values that look like local paths must use data_type=url"):
         AudioContext(
             column_name="audio_base64", data_type=ModalityDataType.BASE64, audio_format=AudioFormat.MP3
         ).get_contexts({"audio_base64": "screen_recording.mp3"})
@@ -329,6 +335,9 @@ def test_video_context_get_contexts_single_string() -> None:
     assert video_context.get_contexts({"video_url": "clips/screen_recording.mp4"}) == [
         get_media_url_context(Modality.VIDEO.value, "clips/screen_recording.mp4")
     ]
+    assert video_context.get_contexts({"video_url": "file:///data/clips/screen_recording.mp4"}) == [
+        get_media_url_context(Modality.VIDEO.value, "file:///data/clips/screen_recording.mp4")
+    ]
 
 
 def test_video_context_get_contexts_list_json_and_numpy() -> None:
@@ -359,10 +368,6 @@ def test_video_context_auto_detect_url_and_data_uri() -> None:
         get_media_url_context(Modality.VIDEO.value, "https://example.com/video.mp4")
     ]
 
-    assert VideoContext(column_name="video_col").get_contexts({"video_col": "clips/screen_recording.webm"}) == [
-        get_media_url_context(Modality.VIDEO.value, "clips/screen_recording.webm")
-    ]
-
     assert VideoContext(column_name="video_col").get_contexts({"video_col": "https://example.com/download?id=123"}) == [
         get_media_url_context(Modality.VIDEO.value, "https://example.com/download?id=123")
     ]
@@ -372,6 +377,12 @@ def test_video_context_auto_detect_url_and_data_uri() -> None:
     ]
 
 
+@pytest.mark.parametrize("video_path", ["clips/screen_recording.webm", "file:///data/clips/screen_recording.mp4"])
+def test_video_context_auto_detect_local_path_rejected(video_path: str) -> None:
+    with pytest.raises(ValueError, match="video context values that look like local paths must use data_type=url"):
+        VideoContext(column_name="video_col").get_contexts({"video_col": video_path})
+
+
 def test_video_context_validate_video_format() -> None:
     with pytest.raises(ValueError, match="video_format is required when data_type is base64"):
         VideoContext(column_name="video_base64", data_type=ModalityDataType.BASE64)
@@ -387,11 +398,12 @@ def test_video_context_validate_video_format() -> None:
             {"video_base64": "data:video/mp4;base64,video1base64"}
         )
 
-    assert VideoContext(column_name="video_base64", video_format=VideoFormat.MP4).get_contexts(
-        {"video_base64": "screen_recording.mp4"}
-    ) == [get_media_url_context(Modality.VIDEO.value, "screen_recording.mp4")]
+    with pytest.raises(ValueError, match="video context values that look like local paths must use data_type=url"):
+        VideoContext(column_name="video_base64", video_format=VideoFormat.MP4).get_contexts(
+            {"video_base64": "screen_recording.mp4"}
+        )
 
-    with pytest.raises(ValueError, match="video base64 context values must be base64 video data"):
+    with pytest.raises(ValueError, match="video context values that look like local paths must use data_type=url"):
         VideoContext(
             column_name="video_base64", data_type=ModalityDataType.BASE64, video_format=VideoFormat.MP4
         ).get_contexts({"video_base64": "screen_recording.mp4"})

Original file line number	Diff line number	Diff line change
`@@ -311,7 +311,7 @@`
`311`	`311`	`"]\n",`
`312`	`312`	"```\n",
`313`	`313`	`"\n",`
`314`		- "URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths in URL mode require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access."
	`314`	+ "URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths require explicit URL mode and require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access."
`315`	`315`	`]`
`316`	`316`	`},`
`317`	`317`	`{`
Original file line number	Diff line number	Diff line change
`@@ -184,7 +184,7 @@ def convert_image_to_chat_format(record, height: int) -> dict:`
`184`	`184`	`# ]`
`185`	`185`	# ```
`186`	`186`	`#`
`187`		-# URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths in URL mode require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
	`187`	+# URL-backed media can use `data_type=dd.ModalityDataType.URL`, subject to the provider's URL support and file-size limits. Local audio/video paths require explicit URL mode and require the model endpoint to have filesystem access to the same paths, typically a colocated vLLM server configured for local media access.
`188`	`188`
`189`	`189`	`# %%`
`190`	`190`	`# Add a column to generate detailed image descriptions`