diff --git a/amber/src/main/scala/org/apache/texera/web/auth/UserAuthenticator.scala b/amber/src/main/scala/org/apache/texera/web/auth/UserAuthenticator.scala
index 8111e442901..70684730cfa 100644
--- a/amber/src/main/scala/org/apache/texera/web/auth/UserAuthenticator.scala
+++ b/amber/src/main/scala/org/apache/texera/web/auth/UserAuthenticator.scala
@@ -35,7 +35,7 @@ import java.util.Optional
object UserAuthenticator extends Authenticator[JwtContext, SessionUser] with LazyLogging {
override def authenticate(context: JwtContext): Optional[SessionUser] = {
try {
- Optional.of(JwtParser.claimsToSessionUser(context.getJwtClaims))
+ JwtParser.claimsToOptionalSessionUser(context.getJwtClaims)
} catch {
case e: Exception =>
logger.error("Failed to authenticate the JwtContext", e)
diff --git a/common/auth/src/main/scala/org/apache/texera/auth/JwtParser.scala b/common/auth/src/main/scala/org/apache/texera/auth/JwtParser.scala
index bb139e7093a..8a4225df21e 100644
--- a/common/auth/src/main/scala/org/apache/texera/auth/JwtParser.scala
+++ b/common/auth/src/main/scala/org/apache/texera/auth/JwtParser.scala
@@ -38,7 +38,7 @@ object JwtParser extends LazyLogging {
/** Verify and parse a Bearer token string. */
def parseToken(token: String): Optional[SessionUser] = {
try {
- Optional.of(claimsToSessionUser(JwtAuth.jwtConsumer.processToClaims(token)))
+ claimsToOptionalSessionUser(JwtAuth.jwtConsumer.processToClaims(token))
} catch {
case _: UnresolvableKeyException =>
logger.error("Invalid JWT Signature")
@@ -49,6 +49,19 @@ object JwtParser extends LazyLogging {
}
}
+  /** Convert already-verified claims to a [[SessionUser]]. Logs and returns empty when a
+    * required claim is missing (IllegalArgumentException) or has the wrong type (jose4j
+    * MalformedClaimException) instead of letting the exception escape to the caller. */
+  def claimsToOptionalSessionUser(claims: JwtClaims): Optional[SessionUser] = {
+    try {
+      Optional.of(claimsToSessionUser(claims))
+    } catch {
+      case e @ (_: IllegalArgumentException | _: org.jose4j.jwt.MalformedClaimException) =>
+        logger.error(s"Invalid JWT claims: ${e.getMessage}")
+        Optional.empty()
+    }
+  }
+
/** Build a [[SessionUser]] from already-verified claims. Used by both
* [[parseToken]] (which verifies then calls this) and amber's
* `UserAuthenticator` (which the toastshaman filter calls after its own
@@ -59,8 +72,12 @@ object JwtParser extends LazyLogging {
val email = claims.getClaimValue("email", classOf[String])
// jose4j returns Long after JSON round-trip but the original setClaim
// call writes Integer; widen via Number to handle both cases.
- val userId = claims.getClaimValue("userId", classOf[Number]).intValue()
- val role = UserRoleEnum.valueOf(claims.getClaimValue("role").asInstanceOf[String])
+ val userId = Option(claims.getClaimValue("userId", classOf[Number]))
+ .map(_.intValue())
+ .getOrElse(throw new IllegalArgumentException("JWT claim 'userId' is required."))
+ val roleName = Option(claims.getClaimValue("role", classOf[String]))
+ .getOrElse(throw new IllegalArgumentException("JWT claim 'role' is required."))
+ val role = UserRoleEnum.valueOf(roleName)
val googleId = claims.getClaimValue("googleId", classOf[String])
val googleAvatar = claims.getClaimValue("googleAvatar", classOf[String])
val user = new User(
diff --git a/common/auth/src/test/scala/org/apache/texera/auth/JwtParserSpec.scala b/common/auth/src/test/scala/org/apache/texera/auth/JwtParserSpec.scala
index dc91de4d645..aa2e0c0423c 100644
--- a/common/auth/src/test/scala/org/apache/texera/auth/JwtParserSpec.scala
+++ b/common/auth/src/test/scala/org/apache/texera/auth/JwtParserSpec.scala
@@ -74,6 +74,18 @@ class JwtParserSpec extends AnyFlatSpec with Matchers {
u.getGoogleAvatar shouldBe "avatar-blob"
}
+  it should "return empty when already-verified claims are missing userId" in {
+    val claimsWithoutUserId = buildClaims()
+    claimsWithoutUserId.unsetClaim("userId") // drop the required claim under test
+    JwtParser.claimsToOptionalSessionUser(claimsWithoutUserId).isPresent shouldBe false
+  }
+
+  it should "return empty when already-verified claims are missing role" in {
+    val claimsWithoutRole = buildClaims()
+    claimsWithoutRole.unsetClaim("role") // drop the required claim under test
+    JwtParser.claimsToOptionalSessionUser(claimsWithoutRole).isPresent shouldBe false
+  }
+
"JwtParser.parseToken" should "return empty on a structurally invalid token" in {
JwtParser.parseToken("not-a-real-jwt").isPresent shouldBe false
}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
index 5f203717d1a..0e1062c75a4 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
@@ -25,9 +25,12 @@ import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
import org.apache.texera.amber.operator.PythonOperatorDescriptor
import org.apache.texera.amber.operator.huggingFace.codegen.{
+ AudioTaskCodegen,
CodegenContext,
ImageTaskCodegen,
+ MediaGenCodegen,
PythonCodegenBase,
+ QaRankingCodegen,
TaskCodegen,
TextGenCodegen
}
@@ -95,6 +98,36 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
@AutofillAttributeName
var inputImageColumn: EncodableString = ""
+ @JsonProperty(value = "audioInput", required = false)
+ @JsonSchemaTitle("Audio Upload")
+ @JsonPropertyDescription("Upload audio for Hugging Face audio tasks")
+ var audioInput: EncodableString = ""
+
+ @JsonProperty(value = "inputAudioColumn", required = false)
+ @JsonSchemaTitle("Input Audio Column")
+ @JsonPropertyDescription("Column containing audio data from the input table")
+ @AutofillAttributeName
+ var inputAudioColumn: EncodableString = ""
+
+ @JsonProperty(value = "contextColumn", required = false)
+ @JsonSchemaTitle("Context Column")
+ @JsonPropertyDescription("Column containing the context passage for question answering")
+ @AutofillAttributeName
+ var contextColumn: EncodableString = ""
+
+ @JsonProperty(value = "candidateLabels", required = false)
+ @JsonSchemaTitle("Candidate Labels")
+ @JsonPropertyDescription("Comma-separated candidate labels for zero-shot classification")
+ var candidateLabels: EncodableString = ""
+
+ @JsonProperty(value = "sentencesColumn", required = false)
+ @JsonSchemaTitle("Sentences Column")
+ @JsonPropertyDescription(
+ "Column with comma-separated sentences for sentence similarity and text ranking"
+ )
+ @AutofillAttributeName
+ var sentencesColumn: EncodableString = ""
+
@JsonProperty(
value = "systemPrompt",
required = false,
@@ -138,6 +171,9 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
val byTask = scala.collection.mutable.Map.empty[String, TaskCodegen]
byTask += (TextGenCodegen.task -> TextGenCodegen)
ImageTaskCodegen.tasks.foreach(t => byTask += (t -> ImageTaskCodegen))
+ AudioTaskCodegen.tasks.foreach(t => byTask += (t -> AudioTaskCodegen))
+ MediaGenCodegen.tasks.foreach(t => byTask += (t -> MediaGenCodegen))
+ QaRankingCodegen.tasks.foreach(t => byTask += (t -> QaRankingCodegen))
byTask.toMap
}
@@ -181,6 +217,16 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
if (imageInput == null) "" else imageInput
val safeInputImageColumn: EncodableString =
if (inputImageColumn == null) "" else inputImageColumn
+ val safeAudioInput: EncodableString =
+ if (audioInput == null) "" else audioInput
+ val safeInputAudioColumn: EncodableString =
+ if (inputAudioColumn == null) "" else inputAudioColumn
+ val safeContextColumn: EncodableString =
+ if (contextColumn == null) "" else contextColumn
+ val safeCandidateLabels: EncodableString =
+ if (candidateLabels == null) "" else candidateLabels
+ val safeSentencesColumn: EncodableString =
+ if (sentencesColumn == null) "" else sentencesColumn
val ctx = CodegenContext(
hfApiToken = safeToken,
@@ -192,7 +238,12 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
safeMaxTokens = safeMaxTokens,
safeTemp = safeTemp,
imageInput = safeImageInput,
- inputImageColumn = safeInputImageColumn
+ inputImageColumn = safeInputImageColumn,
+ audioInput = safeAudioInput,
+ inputAudioColumn = safeInputAudioColumn,
+ contextColumn = safeContextColumn,
+ candidateLabels = safeCandidateLabels,
+ sentencesColumn = safeSentencesColumn
)
PythonCodegenBase.render(ctx, codegenForTask(safeTask))
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/AudioTaskCodegen.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/AudioTaskCodegen.scala
new file mode 100644
index 00000000000..560244962aa
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/AudioTaskCodegen.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.huggingFace.codegen
+
+/**
+ * Codegen for Hugging Face audio task families.
+ *
+ * ASR and audio-classification send audio bytes as the raw request body.
+ * Text-to-speech is prompt-driven and sends a JSON payload. Its JSON response
+ * fields are type-checked before indexing; unmatched shapes are dumped as JSON.
+ */
+object AudioTaskCodegen extends TaskCodegen {
+
+ override val task: String = "automatic-speech-recognition"
+
+ override val tasks: Set[String] = Set(
+ "automatic-speech-recognition",
+ "audio-classification",
+ "text-to-speech"
+ )
+
+ override def payloadPython(ctx: CodegenContext): String = // raw audio body, or JSON prompt for TTS
+ """ if task in audio_only_tasks:
+ | payload = current_audio_bytes
+ | use_raw_binary_body = True
+ | raw_binary_headers = audio_headers
+ | elif task == "text-to-speech":
+ | payload = {"inputs": prompt_value}""".stripMargin
+
+ override def parsePython(ctx: CodegenContext): String = // per-task extraction from the JSON body
+ """ if task == "text-to-speech":
+ | if isinstance(body, dict):
+ | if "output" in body:
+ | out = body["output"]
+ | url = out[0] if isinstance(out, list) and out else out
+ | if isinstance(url, str) and url.startswith("http"):
+ | return self._url_to_data_url(url)
+ | if "audio" in body:
+ | audio = body["audio"]
+ | if isinstance(audio, dict):
+ | if "url" in audio:
+ | return self._url_to_data_url(audio["url"])
+ | if "b64_json" in audio:
+ | return f"data:audio/mpeg;base64,{audio['b64_json']}"
+ | if "data" in body:
+ | data = body["data"]
+ | if isinstance(data, list) and data and isinstance(data[0], dict):
+ | if "url" in data[0]:
+ | return self._url_to_data_url(data[0]["url"])
+ | if "b64_json" in data[0]:
+ | return f"data:audio/mpeg;base64,{data[0]['b64_json']}"
+ | return json.dumps(body)
+ | elif task == "automatic-speech-recognition":
+ | if isinstance(body, dict):
+ | if "text" in body:
+ | return body["text"]
+ | if "generated_text" in body:
+ | return body["generated_text"]
+ | return json.dumps(body)
+ | elif task == "audio-classification":
+ | return json.dumps(body)""".stripMargin
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/ImageTaskCodegen.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/ImageTaskCodegen.scala
index c5c4a2669c4..5a5ee0a937e 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/ImageTaskCodegen.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/ImageTaskCodegen.scala
@@ -90,26 +90,16 @@ object ImageTaskCodegen extends TaskCodegen {
| use_raw_binary_body = True
| raw_binary_headers = image_headers
| elif task == "zero-shot-image-classification":
- | # Zero-shot requires the caller to supply candidate labels.
- | # We reuse the prompt column as a comma-separated label list so
- | # the task is shippable without a dedicated operator field.
- | # TODO: replace with a first-class `candidateLabels` field once
- | # the property panel supports task-specific inputs.
- | #
- | # Fail fast if usable labels can't be derived. Both modes lead to
- | # a meaningless inference call:
- | # 1. Empty prompt column -> labels = []
- | # The HF API rejects candidate_labels: [] with an opaque 400.
- | # 2. Missing prompt column -> upstream sets prompt_value
- | # to the fallback "What is shown in this image?", which has
- | # no comma, so labels collapses to a single nonsense entry.
- | # Zero-shot classification needs >= 2 candidate labels to be
- | # meaningful — surface a configuration error in both cases.
- | labels = [s.strip() for s in prompt_value.split(",") if s.strip()]
+ | # Prefer the dedicated candidateLabels property; fall back to
+ | # the prompt column for backward compatibility.
+ | label_source = (self.CANDIDATE_LABELS or "").strip() if self.CANDIDATE_LABELS else ""
+ | if not label_source and prompt_value:
+ | label_source = prompt_value
+ | labels = [s.strip() for s in label_source.split(",") if s.strip()]
| if len(labels) < 2:
| raise ValueError(
| "zero-shot-image-classification requires at least 2 candidate "
- | "labels: provide a comma-separated list in the prompt column."
+ | "labels: provide a comma-separated list in the Candidate Labels field."
| )
| payload = {
| "inputs": self._image_input_as_base64(current_image_bytes),
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/MediaGenCodegen.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/MediaGenCodegen.scala
new file mode 100644
index 00000000000..73047da89c3
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/MediaGenCodegen.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.huggingFace.codegen
+
+/**
+ * Codegen for prompt-driven media generation tasks.
+ *
+ * Providers return media in several shapes: raw bytes, OpenAI-style
+ * b64_json, or URLs. URL responses are normalized to data URLs by the
+ * shared `_url_to_data_url` helper; list fields are verified to be non-empty
+ * lists before indexing, and unrecognized bodies are returned as a JSON dump.
+ */
+object MediaGenCodegen extends TaskCodegen {
+
+ override val task: String = "text-to-image"
+
+ override val tasks: Set[String] = Set(
+ "text-to-image",
+ "text-to-video"
+ )
+
+ override def payloadPython(ctx: CodegenContext): String = // both tasks send only the prompt
+ """ payload = {"inputs": prompt_value}""".stripMargin
+
+ override def parsePython(ctx: CodegenContext): String = // per-task extraction from the JSON body
+ """ if task == "text-to-image":
+ | if isinstance(body, dict):
+ | if "output" in body:
+ | out = body["output"]
+ | url = out[0] if isinstance(out, list) and out else out
+ | if isinstance(url, str) and url.startswith("http"):
+ | return self._url_to_data_url(url)
+ | if "images" in body:
+ | images = body["images"]
+ | if isinstance(images, list) and images and isinstance(images[0], dict) and "url" in images[0]:
+ | return self._url_to_data_url(images[0]["url"])
+ | if "data" in body:
+ | data = body["data"]
+ | if isinstance(data, dict) and "outputs" in data:
+ | outputs = data["outputs"]
+ | if isinstance(outputs, list) and outputs and isinstance(outputs[0], str) and outputs[0].startswith("http"):
+ | return self._url_to_data_url(outputs[0])
+ | if isinstance(data, list) and data and isinstance(data[0], dict):
+ | if "b64_json" in data[0]:
+ | return f"data:image/png;base64,{data[0]['b64_json']}"
+ | if "url" in data[0]:
+ | return self._url_to_data_url(data[0]["url"])
+ | return json.dumps(body)
+ | elif task == "text-to-video":
+ | if isinstance(body, dict):
+ | if "output" in body:
+ | out = body["output"]
+ | url = out[0] if isinstance(out, list) and out else out
+ | if isinstance(url, str) and url.startswith("http"):
+ | return self._url_to_data_url(url)
+ | if "video" in body:
+ | video = body["video"]
+ | if isinstance(video, dict) and "url" in video:
+ | return self._url_to_data_url(video["url"])
+ | return json.dumps(body)""".stripMargin
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala
index eac4641c62e..4184182a2e0 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala
@@ -57,6 +57,11 @@ object PythonCodegenBase {
val temperature = ctx.safeTemp
val imageInput = ctx.imageInput
val inputImageColumn = ctx.inputImageColumn
+ val audioInput = ctx.audioInput
+ val inputAudioColumn = ctx.inputAudioColumn
+ val contextColumn = ctx.contextColumn
+ val candidateLabels = ctx.candidateLabels
+ val sentencesColumn = ctx.sentencesColumn
pyb"""import os
|import re
|import json
@@ -137,6 +142,11 @@ object PythonCodegenBase {
| self.TEMPERATURE = $temperature
| self.IMAGE_INPUT = $imageInput
| self.INPUT_IMAGE_COLUMN = $inputImageColumn
+ | self.AUDIO_INPUT = $audioInput
+ | self.INPUT_AUDIO_COLUMN = $inputAudioColumn
+ | self.CONTEXT_COLUMN = $contextColumn
+ | self.CANDIDATE_LABELS = $candidateLabels
+ | self.SENTENCES_COLUMN = $sentencesColumn
|
| def _resolve_providers(self, token):
| '''Query the HF Hub API for inference providers serving this model.
@@ -286,7 +296,14 @@ object PythonCodegenBase {
| if provider_name == "replicate":
| url = f"{base}/v1/models/{provider_id}/predictions"
| hdrs = {**json_headers, "Prefer": "wait"}
- | if task == "image-to-image" and img_b64:
+ | if task == "text-to-speech":
+ | inp = {"text": prompt_value}
+ | elif task in ("text-to-image", "text-to-video"):
+ | inp = {"prompt": prompt_value}
+ | elif task in ("automatic-speech-recognition", "audio-classification") and img_b64:
+ | audio_content_type = raw_binary_headers.get("Content-Type", "audio/mpeg")
+ | inp = {"audio": f"data:{audio_content_type};base64,{img_b64}"}
+ | elif task == "image-to-image" and img_b64:
| data_url = f"data:image/png;base64,{img_b64}"
| inp = {"image": data_url, "images": [data_url], "input_image": data_url, "prompt": prompt_value}
| elif img_b64:
@@ -340,6 +357,10 @@ object PythonCodegenBase {
| # Fal-ai: per-model endpoint.
| if provider_name == "fal-ai":
| url = f"{base}/{provider_id}"
+ | if task == "text-to-speech":
+ | return requests.post(url, headers=json_headers, json={"text": prompt_value}, timeout=120)
+ | if task in ("text-to-image", "text-to-video"):
+ | return requests.post(url, headers=json_headers, json={"prompt": prompt_value}, timeout=120)
| if task == "image-to-image" and img_b64:
| data_url = f"data:image/png;base64,{img_b64}"
| return requests.post(url, headers=json_headers, json={"image_url": data_url, "image_urls": [data_url], "prompt": prompt_value}, timeout=120)
@@ -398,6 +419,12 @@ object PythonCodegenBase {
| return poll_resp
|
| if provider_name in self.OPENAI_COMPATIBLE_PROVIDERS:
+ | if task == "text-to-image":
+ | url = f"{base}/v1/images/generations"
+ | return requests.post(url, headers=json_headers, json={"model": provider_id, "prompt": prompt_value}, timeout=120)
+ | if task == "text-to-speech":
+ | url = f"{base}/v1/audio/speech"
+ | return requests.post(url, headers=json_headers, json={"model": provider_id, "input": prompt_value}, timeout=120)
| url = f"{base}/{self.CHAT_ROUTES.get(provider_name, 'v1/chat/completions')}"
| messages = [{"role": "user", "content": prompt_value}]
| if img_b64:
@@ -444,6 +471,7 @@ object PythonCodegenBase {
| image_only_tasks = ("image-classification", "object-detection", "image-segmentation", "image-to-text")
| image_prompt_tasks = ("visual-question-answering", "document-question-answering", "zero-shot-image-classification", "image-text-to-text", "image-to-image")
| image_tasks = image_only_tasks + image_prompt_tasks
+ | audio_only_tasks = ("automatic-speech-recognition", "audio-classification")
|
| # --- validate MODEL_ID format before any HF URL is built ---
| if not _HF_MODEL_ID_PATTERN.match(self.MODEL_ID or ""):
@@ -463,12 +491,29 @@ object PythonCodegenBase {
| # --- resolve all available inference providers for this model (tried in order) ---
| providers = self._resolve_providers(token)
|
- | # --- validate prompt column exists (required for non-image tasks) ---
- | if task not in image_tasks:
+ | # --- validate prompt column exists (skipped for image tasks and binary-only audio tasks) ---
+ | if task not in image_tasks and task not in audio_only_tasks:
| assert prompt_col in table.columns, (
| f"Prompt column '{prompt_col}' not found in input table. "
| f"Available columns: {list(table.columns)}"
| )
+ | if task == "zero-shot-classification":
+ | assert self.CANDIDATE_LABELS and self.CANDIDATE_LABELS.strip(), (
+ | "Candidate Labels are required for zero-shot-classification. "
+ | "Provide a comma-separated list of labels."
+ | )
+ | if task == "question-answering":
+ | ctx_col = self.CONTEXT_COLUMN
+ | assert ctx_col and ctx_col in table.columns, (
+ | f"Context column '{ctx_col}' not found in input table. "
+ | f"Available columns: {list(table.columns)}"
+ | )
+ | if task in ("sentence-similarity", "text-ranking"):
+ | sent_col = self.SENTENCES_COLUMN
+ | assert sent_col and sent_col in table.columns, (
+ | f"Sentences column '{sent_col}' not found in input table. "
+ | f"Available columns: {list(table.columns)}"
+ | )
|
| # --- handle empty table ---
| if table.empty:
@@ -484,12 +529,29 @@ object PythonCodegenBase {
| "Authorization": f"Bearer {token}",
| "Content-Type": "application/octet-stream",
| }
+ | # --- pre-compute table dict for table-question-answering ---
+ | table_dict = None
+ | if task == "table-question-answering":
+ | table_dict = {}
+ | for col in table.columns:
+ | if col != prompt_col and col != result_col:
+ | table_dict[col] = [
+ | str(v) if not pd.isna(v) else "" for v in table[col].tolist()
+ | ]
|
| # --- resolve image source (upload or column) for image tasks ---
| has_image_upload = bool(self.IMAGE_INPUT) and bool(str(self.IMAGE_INPUT).strip())
| use_image_column = not has_image_upload and bool(self.INPUT_IMAGE_COLUMN) and self.INPUT_IMAGE_COLUMN in table.columns
| image_bytes = None
| image_error = None
+ | has_audio_upload = bool(self.AUDIO_INPUT) and bool(str(self.AUDIO_INPUT).strip())
+ | use_audio_column = not has_audio_upload and bool(self.INPUT_AUDIO_COLUMN) and self.INPUT_AUDIO_COLUMN in table.columns
+ | audio_headers = {
+ | "Authorization": f"Bearer {token}",
+ | "Content-Type": "application/octet-stream" if use_audio_column else self._get_audio_content_type(),
+ | }
+ | audio_bytes = None
+ | audio_error = None
| if task in image_tasks and not use_image_column:
| if not has_image_upload:
| image_error = "No image source. Set an Input Image Column or upload an image."
@@ -498,15 +560,28 @@ object PythonCodegenBase {
| image_bytes = self._read_image_input()
| except Exception as e:
| image_error = f"Could not read image input ({type(e).__name__}: {e})"
+ | if task in audio_only_tasks and not use_audio_column:
+ | if not has_audio_upload:
+ | audio_error = "No audio source. Set an Input Audio Column or upload audio."
+ | else:
+ | try:
+ | audio_bytes = self._read_audio_input()
+ | except Exception as e:
+ | audio_error = f"Could not read audio input ({type(e).__name__}: {e})"
|
| results = []
| for idx, row in table.iterrows():
| if image_error is not None:
| results.append(self._format_error("Image task configuration error", image_error))
| continue
+ | if audio_error is not None:
+ | results.append(self._format_error("Audio task configuration error", audio_error))
+ | continue
|
| if task in image_only_tasks:
| prompt_value = ""
+ | elif task in audio_only_tasks:
+ | prompt_value = ""
| elif task in image_prompt_tasks and prompt_col not in table.columns:
| prompt_value = "What is shown in this image?"
| else:
@@ -529,6 +604,18 @@ object PythonCodegenBase {
| results.append(self._format_error("Image data error", f"Row {idx}: {type(e).__name__}: {e}"))
| continue
|
+ | # --- resolve per-row audio bytes from column ---
+ | current_audio_bytes = audio_bytes
+ | if task in audio_only_tasks and use_audio_column:
+ | try:
+ | current_audio_bytes = self._read_binary_value(row[self.INPUT_AUDIO_COLUMN])
+ | if current_audio_bytes is None:
+ | results.append(self._format_error("Audio data error", f"Row {idx}: audio column is empty"))
+ | continue
+ | except Exception as e:
+ | results.append(self._format_error("Audio data error", f"Row {idx}: {type(e).__name__}: {e}"))
+ | continue
+ |
| # --- build task-specific payload (provided by per-task codegen) ---
| use_raw_binary_body = False
| raw_binary_headers = image_headers
@@ -576,6 +663,10 @@ object PythonCodegenBase {
| b64 = base64.b64encode(resp.content).decode("utf-8")
| results.append(f"data:{content_type};base64,{b64}")
| continue
+ | if content_type.startswith("audio/") or content_type.startswith("video/"):
+ | b64 = base64.b64encode(resp.content).decode("utf-8")
+ | results.append(f"data:{content_type};base64,{b64}")
+ | continue
|
| try:
| body = resp.json()
@@ -702,6 +793,22 @@ object PythonCodegenBase {
| def _image_input_as_base64(self, image_bytes):
| return base64.b64encode(image_bytes).decode("utf-8")
|
+ | def _read_audio_input(self):
+ | audio_input = str(self.AUDIO_INPUT or "").strip()
+ | if audio_input.startswith("data:"):
+ | _, encoded = audio_input.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if audio_input.startswith("http://") or audio_input.startswith("https://"):
+ | _, data = self._fetch_remote_url(audio_input)
+ | return data
+ | # Reading arbitrary worker-filesystem paths is intentionally NOT
+ | # supported: uploaded audio arrives as a data URL and remote audio
+ | # must be fetched through the hardened https-only helper above.
+ | raise ValueError(
+ | "Unsupported audio input. Upload an audio file (sent as a data URL) "
+ | "or provide a public https audio URL."
+ | )
+ |
| def _read_binary_value(self, value):
| if value is None:
| return None
@@ -821,6 +928,28 @@ object PythonCodegenBase {
| return text[start_pos:pos], pos
| return None, start_pos
|
+ | def _get_audio_content_type(self):
+ | audio_input = str(self.AUDIO_INPUT or "").strip().lower()
+ | if audio_input.startswith("data:"):
+ | header = audio_input.split(",", 1)[0]
+ | if ";" in header:
+ | return header[5:header.index(";")]
+ | return header[5:]
+ | extension_map = {
+ | ".mp3": "audio/mpeg",
+ | ".mpeg": "audio/mpeg",
+ | ".wav": "audio/wav",
+ | ".flac": "audio/flac",
+ | ".ogg": "audio/ogg",
+ | ".oga": "audio/ogg",
+ | ".webm": "audio/webm",
+ | ".opus": "audio/webm;codecs=opus",
+ | ".amr": "audio/amr",
+ | ".m4a": "audio/m4a",
+ | }
+ | _, ext = os.path.splitext(audio_input)
+ | return extension_map.get(ext, "audio/mpeg")
+ |
| def _url_to_data_url(self, url):
| '''Fetch a URL and return a data URL with the correct MIME type.
| Fetched via _fetch_remote_url so a malicious/compromised provider
@@ -831,12 +960,12 @@ object PythonCodegenBase {
| if not content_type or content_type == "application/octet-stream":
| from urllib.parse import urlparse as _urlparse
| ext = os.path.splitext(_urlparse(url).path.lower())[1]
- | mime_map = {".png": "image/png", ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".gif": "image/gif", ".webp": "image/webp", ".svg": "image/svg+xml", ".mp4": "video/mp4", ".webm": "video/webm"}
+ | mime_map = {".png": "image/png", ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".gif": "image/gif", ".webp": "image/webp", ".svg": "image/svg+xml", ".mp3": "audio/mpeg", ".mpeg": "audio/mpeg", ".wav": "audio/wav", ".flac": "audio/flac", ".ogg": "audio/ogg", ".oga": "audio/ogg", ".m4a": "audio/mp4", ".mp4": "video/mp4", ".webm": "video/webm"}
| guessed = mime_map.get(ext, "")
| if guessed:
| content_type = guessed
| else:
- | task_mime = {"image-to-image": "image/png"}
+ | task_mime = {"image-to-image": "image/png", "text-to-image": "image/png", "text-to-video": "video/mp4", "text-to-speech": "audio/mpeg"}
| content_type = task_mime.get(self.TASK, "application/octet-stream")
| b64 = base64.b64encode(data).decode("utf-8")
| return f"data:{content_type};base64,{b64}"
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala
new file mode 100644
index 00000000000..79572e8259f
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/QaRankingCodegen.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.huggingFace.codegen
+
+/**
+ * Codegen for question-answering, zero-shot, similarity, and ranking tasks.
+ *
+ * Payloads: QA sends {question, context} read from CONTEXT_COLUMN; table QA sends
+ * {query, table}; zero-shot sends comma-split CANDIDATE_LABELS as candidate_labels;
+ * similarity/ranking send {source_sentence, sentences} comma-split from SENTENCES_COLUMN.
+ */
+object QaRankingCodegen extends TaskCodegen {
+
+ override val task: String = "question-answering"
+
+ override val tasks: Set[String] = Set(
+ "question-answering",
+ "table-question-answering",
+ "zero-shot-classification",
+ "sentence-similarity",
+ "text-ranking"
+ )
+
+ override def payloadPython(ctx: CodegenContext): String = // Python that builds `payload` per task
+ """ if task == "question-answering":
+ | ctx_val = row[self.CONTEXT_COLUMN]
+ | ctx_val = "" if pd.isna(ctx_val) else str(ctx_val)
+ | payload = {"inputs": {"question": prompt_value, "context": ctx_val}}
+ | elif task == "table-question-answering":
+ | payload = {"inputs": {"query": prompt_value, "table": table_dict}}
+ | elif task == "zero-shot-classification":
+ | labels = [l.strip() for l in self.CANDIDATE_LABELS.split(",") if l.strip()]
+ | payload = {
+ | "inputs": prompt_value,
+ | "parameters": {"candidate_labels": labels},
+ | }
+ | elif task in ("sentence-similarity", "text-ranking"):
+ | sent_val = row[self.SENTENCES_COLUMN]
+ | sent_val = "" if pd.isna(sent_val) else str(sent_val)
+ | sentences_list = [s.strip() for s in sent_val.split(",") if s.strip()]
+ | payload = {
+ | "inputs": {
+ | "source_sentence": prompt_value,
+ | "sentences": sentences_list,
+ | }
+ | }
+ | else:
+ | payload = {"inputs": prompt_value}""".stripMargin
+
+ override def parsePython(ctx: CodegenContext): String = // QA tasks return `answer`; others dump JSON
+ """ if task == "question-answering":
+ | return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body)
+ | elif task == "table-question-answering":
+ | return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body)
+ | elif task in ("zero-shot-classification", "sentence-similarity", "text-ranking"):
+ | return json.dumps(body)""".stripMargin
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala
index 299ea5d6e3f..8abcef721b5 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/TaskCodegen.scala
@@ -39,7 +39,12 @@ final case class CodegenContext(
safeMaxTokens: Int,
safeTemp: Double,
imageInput: EncodableString = "",
- inputImageColumn: EncodableString = ""
+ inputImageColumn: EncodableString = "",
+ audioInput: EncodableString = "",
+ inputAudioColumn: EncodableString = "",
+ contextColumn: EncodableString = "",
+ candidateLabels: EncodableString = "",
+ sentencesColumn: EncodableString = ""
)
/**
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala
index 0d6e09302fb..b5c27ee521b 100644
--- a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDescSpec.scala
@@ -21,7 +21,13 @@ package org.apache.texera.amber.operator.huggingFace
import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
import org.apache.texera.amber.core.workflow.PortIdentity
-import org.apache.texera.amber.operator.huggingFace.codegen.{CodegenContext, TextGenCodegen}
+import org.apache.texera.amber.operator.huggingFace.codegen.{
+ AudioTaskCodegen,
+ CodegenContext,
+ MediaGenCodegen,
+ QaRankingCodegen,
+ TextGenCodegen
+}
import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString
import org.scalatest.flatspec.AnyFlatSpec
@@ -39,7 +45,12 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers {
temperature: Double = 0.7,
resultColumn: EncodableString = "hf_response",
imageInput: EncodableString = "",
- inputImageColumn: EncodableString = ""
+ inputImageColumn: EncodableString = "",
+ audioInput: EncodableString = "",
+ inputAudioColumn: EncodableString = "",
+ contextColumn: EncodableString = "",
+ candidateLabels: EncodableString = "",
+ sentencesColumn: EncodableString = ""
): HuggingFaceInferenceOpDesc = {
val desc = new HuggingFaceInferenceOpDesc()
desc.hfApiToken = token
@@ -52,6 +63,11 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers {
desc.resultColumn = resultColumn
desc.imageInput = imageInput
desc.inputImageColumn = inputImageColumn
+ desc.audioInput = audioInput
+ desc.inputAudioColumn = inputAudioColumn
+ desc.contextColumn = contextColumn
+ desc.candidateLabels = candidateLabels
+ desc.sentencesColumn = sentencesColumn
desc
}
@@ -152,6 +168,11 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers {
desc.temperature = null
desc.imageInput = null
desc.inputImageColumn = null
+ desc.audioInput = null
+ desc.inputAudioColumn = null
+ desc.contextColumn = null
+ desc.candidateLabels = null
+ desc.sentencesColumn = null
val code = desc.generatePythonCode()
code should include("class ProcessTableOperator(UDFTableOperator):")
code should include("def open(self):")
@@ -272,10 +293,15 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers {
// size cap
code should include("MAX_REMOTE_FETCH_BYTES")
code should include("Remote file exceeds the")
- // all three fetch sites route through the helper (no raw requests.get on these URLs)
+ // all remote fetch sites route through the helper (no raw requests.get on these URLs)
code should include("_, data = self._fetch_remote_url(image_input)")
+ code should include("_, data = self._fetch_remote_url(audio_input)")
code should include("_, data = self._fetch_remote_url(val)")
code should include("raw_content_type, data = self._fetch_remote_url(url)")
+ code should not include "def _audio_url_to_data_url"
+ code should not include "requests.get(audio_input"
+ code should not include "os.path.exists(audio_input)"
+ code should not include "open(audio_input"
}
it should "treat pandas NA sentinels (NaN, pd.NA, NaT) as missing in _read_binary_value" in {
@@ -402,6 +428,143 @@ class HuggingFaceInferenceOpDescSpec extends AnyFlatSpec with Matchers {
}
}
+ "audio task family" should
+ "route ASR and audio-classification through AudioTaskCodegen as raw binary payloads" in { // builds an ASR descriptor whose audio comes from the "audio" column
+ val code =
+ makeDesc(task = "automatic-speech-recognition", inputAudioColumn = "audio")
+ .generatePythonCode()
+ code should include("self.AUDIO_INPUT = ")
+ code should include("self.INPUT_AUDIO_COLUMN = ")
+ code should include(
+ """audio_only_tasks = ("automatic-speech-recognition", "audio-classification")"""
+ )
+ code should include("payload = current_audio_bytes") // the audio bytes themselves are assigned as the payload
+ code should include("raw_binary_headers = audio_headers")
+ code should include("self._read_audio_input()")
+ code should include(
+ """"Content-Type": "application/octet-stream" if use_audio_column else self._get_audio_content_type()"""
+ )
+ code should include(
+ """audio_content_type = raw_binary_headers.get("Content-Type", "audio/mpeg")"""
+ )
+ code should include(
+ """elif task in ("automatic-speech-recognition", "audio-classification") and img_b64:"""
+ )
+ code should not include "data:audio/wav;base64" // the generated code must not contain a hard-coded WAV data-URL prefix
+ code should include(
+ """if content_type.startswith("audio/") or content_type.startswith("video/"):"""
+ )
+ }
+
+ it should "route text-to-speech through AudioTaskCodegen and normalize audio URLs" in { // text-to-speech with every optional input left at its default
+ val code = makeDesc(task = "text-to-speech").generatePythonCode()
+ code should include("""elif task == "text-to-speech":""")
+ code should include("""payload = {"inputs": prompt_value}""")
+ code should include("self._url_to_data_url(")
+ code should include(""""text-to-speech": "audio/mpeg"""")
+ code should not include "_audio_url_to_data_url" // only the shared _url_to_data_url helper may appear, not an audio-specific one
+ code should include("data:audio/mpeg;base64")
+ }
+
+ it should "register all audio task strings under the dispatcher" in { // checks AudioTaskCodegen.tasks membership, then generates code per task
+ AudioTaskCodegen.tasks should contain allOf (
+ "automatic-speech-recognition",
+ "audio-classification",
+ "text-to-speech"
+ )
+ AudioTaskCodegen.tasks.foreach { t =>
+ val code = makeDesc(task = t, inputAudioColumn = "audio").generatePythonCode()
+ code should include("if task in audio_only_tasks:") // the same marker is expected for every task, including text-to-speech
+ }
+ }
+
+ "media generation task family" should
+ "route text-to-image through MediaGenCodegen and parse URL or b64 responses as data URLs" in { // text-to-image with every optional input left at its default
+ val code = makeDesc(task = "text-to-image").generatePythonCode()
+ code should include("if task not in image_tasks and task not in audio_only_tasks:")
+ code should include("""payload = {"inputs": prompt_value}""")
+ code should include("""if task == "text-to-image":""")
+ code should include("self._url_to_data_url(")
+ code should include("data:image/png;base64") // a PNG data-URL prefix must be present in the generated code
+ }
+
+ it should "route text-to-video through MediaGenCodegen and normalize remote video URLs" in { // text-to-video with every optional input left at its default
+ val code = makeDesc(task = "text-to-video").generatePythonCode()
+ code should include("""elif task == "text-to-video":""")
+ code should include("self._url_to_data_url(")
+ code should include("video/mp4") // only the MIME substring is checked, not a full data-URL prefix
+ }
+
+ it should "register all media generation task strings under the dispatcher" in { // checks MediaGenCodegen.tasks membership, then generates code per task
+ MediaGenCodegen.tasks should contain allOf ("text-to-image", "text-to-video")
+ MediaGenCodegen.tasks.foreach { t =>
+ val code = makeDesc(task = t).generatePythonCode()
+ code should include("""payload = {"inputs": prompt_value}""") // prompt-only payload is expected for every media generation task
+ }
+ }
+
+ "qa and ranking task family" should
+ "route question-answering through QaRankingCodegen with context-column validation" in { // question-answering descriptor with contextColumn = "context"
+ val code = makeDesc(task = "question-answering", contextColumn = "context").generatePythonCode()
+ code should include("self.CONTEXT_COLUMN = ")
+ code should include("""if task == "question-answering":""")
+ code should include("ctx_col = self.CONTEXT_COLUMN")
+ code should include("Context column") // substring of a message in the generated code; presumably the missing-column validation error
+ code should include("""payload = {"inputs": {"question": prompt_value, "context": ctx_val}}""")
+ code should include(
+ """return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body)"""
+ )
+ }
+
+ it should "route table-question-answering with a precomputed table payload" in { // table-question-answering with every optional input left at its default
+ val code = makeDesc(task = "table-question-answering").generatePythonCode()
+ code should include("""if task == "table-question-answering":""")
+ code should include("table_dict = {}") // the generated code must initialise table_dict before it is used in the payload
+ code should include("""payload = {"inputs": {"query": prompt_value, "table": table_dict}}""")
+ code should include(
+ """return body.get("answer", json.dumps(body)) if isinstance(body, dict) else json.dumps(body)"""
+ )
+ }
+
+ it should "route zero-shot-classification with candidate labels" in { // zero-shot descriptor with two comma-separated candidate labels
+ val code =
+ makeDesc(task = "zero-shot-classification", candidateLabels = "positive,negative")
+ .generatePythonCode()
+ code should include("self.CANDIDATE_LABELS = ")
+ code should include("""if task == "zero-shot-classification":""") // `if` form; presumably the validation branch that raises the message asserted below
+ code should include("Candidate Labels are required for zero-shot-classification.")
+ code should include("""elif task == "zero-shot-classification":""") // `elif` form, distinct from the `if` form asserted above
+ code should include("labels = [l.strip() for l in self.CANDIDATE_LABELS.split")
+ code should include(""""parameters": {"candidate_labels": labels}""")
+ }
+
+ it should "route sentence-similarity and text-ranking with sentences-column validation" in { // both tasks are run through the same assertions
+ Seq("sentence-similarity", "text-ranking").foreach { taskName =>
+ val code = makeDesc(task = taskName, sentencesColumn = "sentences").generatePythonCode()
+ code should include("self.SENTENCES_COLUMN = ")
+ code should include("""elif task in ("sentence-similarity", "text-ranking"):""")
+ code should include("sent_col = self.SENTENCES_COLUMN")
+ code should include("Sentences column") // substring of a message in the generated code; presumably the missing-column validation error
+ code should include(""""source_sentence": prompt_value""")
+ code should include(""""sentences": sentences_list""")
+ }
+ }
+
+ it should "register all qa and ranking task strings under the dispatcher" in { // checks QaRankingCodegen.tasks membership, then generates code per task
+ QaRankingCodegen.tasks should contain allOf (
+ "question-answering",
+ "table-question-answering",
+ "zero-shot-classification",
+ "sentence-similarity",
+ "text-ranking"
+ )
+ QaRankingCodegen.tasks.foreach { t =>
+ val code = makeDesc(task = t, contextColumn = "context", sentencesColumn = "sentences")
+ .generatePythonCode()
+ code should include("""if task == "question-answering":""") // the same marker is expected for every task in the family, not a per-task one
+ }
+ }
+
"getOutputSchemas" should "add the result column as a STRING to the inherited schema" in {
val desc = makeDesc(resultColumn = "answer")
val inputSchema = Schema().add("prompt", AttributeType.STRING)
diff --git a/frontend/src/app/app.module.ts b/frontend/src/app/app.module.ts
index 35e82f81b75..c2820725310 100644
--- a/frontend/src/app/app.module.ts
+++ b/frontend/src/app/app.module.ts
@@ -106,6 +106,8 @@ import { AgentPanelComponent } from "./workspace/component/agent/agent-panel/age
import { AgentChatComponent } from "./workspace/component/agent/agent-panel/agent-chat/agent-chat.component";
import { AgentRegistrationComponent } from "./workspace/component/agent/agent-panel/agent-registration/agent-registration.component";
import { HuggingFaceImageUploadComponent } from "./workspace/component/hugging-face-image-upload/hugging-face-image-upload.component";
+import { HuggingFaceComponent } from "./workspace/component/hugging-face/hugging-face.component";
+import { HuggingFaceAudioUploadComponent } from "./workspace/component/hugging-face-audio-upload/hugging-face-audio-upload.component";
import { DatasetFileSelectorComponent } from "./workspace/component/dataset-file-selector/dataset-file-selector.component";
import { DatasetVersionSelectorComponent } from "./workspace/component/dataset-version-selector/dataset-version-selector.component";
import { DatasetSelectionModalComponent } from "./workspace/component/dataset-selection-modal/dataset-selection-modal.component";
@@ -332,6 +334,8 @@ registerLocaleData(en);
AgentChatComponent,
AgentRegistrationComponent,
AgentInteractionComponent,
+ HuggingFaceComponent,
+ HuggingFaceAudioUploadComponent,
HuggingFaceImageUploadComponent,
DatasetFileSelectorComponent,
DatasetVersionSelectorComponent,
diff --git a/frontend/src/app/common/formly/formly-config.ts b/frontend/src/app/common/formly/formly-config.ts
index ba80dc51f96..c4fc54fd77f 100644
--- a/frontend/src/app/common/formly/formly-config.ts
+++ b/frontend/src/app/common/formly/formly-config.ts
@@ -30,6 +30,8 @@ import { FormlyRepeatDndComponent } from "./repeat-dnd/repeat-dnd.component";
import { UiUdfParametersComponent } from "../../workspace/component/ui-udf-parameters/ui-udf-parameters.component";
import { DatasetVersionSelectorComponent } from "../../workspace/component/dataset-version-selector/dataset-version-selector.component";
import { HuggingFaceImageUploadComponent } from "../../workspace/component/hugging-face-image-upload/hugging-face-image-upload.component";
+import { HuggingFaceComponent } from "../../workspace/component/hugging-face/hugging-face.component";
+import { HuggingFaceAudioUploadComponent } from "../../workspace/component/hugging-face-audio-upload/hugging-face-audio-upload.component";
/**
* Configuration for using Json Schema with Formly.
@@ -81,6 +83,8 @@ export const TEXERA_FORMLY_CONFIG = {
{ name: "codearea", component: CodeareaCustomTemplateComponent },
{ name: "inputautocomplete", component: DatasetFileSelectorComponent, wrappers: ["form-field"] },
{ name: "datasetversionselector", component: DatasetVersionSelectorComponent, wrappers: ["form-field"] },
+ { name: "huggingface", component: HuggingFaceComponent, wrappers: ["form-field"] },
+ { name: "huggingface-audio-upload", component: HuggingFaceAudioUploadComponent, wrappers: ["form-field"] },
{ name: "huggingface-image-upload", component: HuggingFaceImageUploadComponent, wrappers: ["form-field"] },
{ name: "repeat-section-dnd", component: FormlyRepeatDndComponent },
{ name: "ui-udf-parameters", component: UiUdfParametersComponent, wrappers: ["form-field"] },
diff --git a/frontend/src/app/common/util/media-type.util.spec.ts b/frontend/src/app/common/util/media-type.util.spec.ts
new file mode 100644
index 00000000000..ed81fec9f45
--- /dev/null
+++ b/frontend/src/app/common/util/media-type.util.spec.ts
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { isAudioUrl, isImageUrl, isVideoUrl } from "./media-type.util";
+
+describe("isImageUrl", () => { // covers the data:image/ prefix rule and the image file-extension rule
+ it("should return true for data:image/ data URLs", () => {
+ expect(isImageUrl("data:image/png;base64,abc123")).toBe(true);
+ expect(isImageUrl("data:image/jpeg;base64,abc123")).toBe(true);
+ expect(isImageUrl("data:image/webp;base64,abc123")).toBe(true);
+ });
+
+ it("should return true for common image file extensions", () => {
+ expect(isImageUrl("https://example.com/photo.png")).toBe(true);
+ expect(isImageUrl("https://example.com/photo.jpg")).toBe(true);
+ expect(isImageUrl("https://example.com/photo.jpeg")).toBe(true);
+ expect(isImageUrl("https://example.com/photo.gif")).toBe(true);
+ expect(isImageUrl("https://example.com/photo.webp")).toBe(true);
+ });
+
+ it("should be case-insensitive for extensions", () => {
+ expect(isImageUrl("https://example.com/photo.PNG")).toBe(true);
+ expect(isImageUrl("https://example.com/photo.JPG")).toBe(true);
+ });
+
+ it("should return true for URLs with query strings", () => { // a "?..." suffix after the extension must still match
+ expect(isImageUrl("https://example.com/photo.png?v=1")).toBe(true);
+ });
+
+ it("should return false for audio and video URLs", () => {
+ expect(isImageUrl("data:audio/mp3;base64,abc")).toBe(false);
+ expect(isImageUrl("data:video/mp4;base64,abc")).toBe(false);
+ expect(isImageUrl("https://example.com/clip.mp4")).toBe(false);
+ });
+
+ it("should return false for plain text strings", () => {
+ expect(isImageUrl("hello world")).toBe(false);
+ expect(isImageUrl("")).toBe(false); // the empty string matches neither rule
+ });
+});
+
+describe("isAudioUrl", () => { // covers the data:audio/ prefix rule and the audio file-extension rule
+ it("should return true for data:audio/ data URLs", () => {
+ expect(isAudioUrl("data:audio/mp3;base64,abc123")).toBe(true);
+ expect(isAudioUrl("data:audio/wav;base64,abc123")).toBe(true);
+ });
+
+ it("should return true for common audio file extensions", () => {
+ expect(isAudioUrl("https://example.com/clip.mp3")).toBe(true);
+ expect(isAudioUrl("https://example.com/clip.wav")).toBe(true);
+ expect(isAudioUrl("https://example.com/clip.ogg")).toBe(true);
+ expect(isAudioUrl("https://example.com/clip.m4a")).toBe(true);
+ expect(isAudioUrl("https://example.com/clip.flac")).toBe(true);
+ });
+
+ it("should be case-insensitive for extensions", () => {
+ expect(isAudioUrl("https://example.com/clip.MP3")).toBe(true);
+ expect(isAudioUrl("https://example.com/clip.WAV")).toBe(true);
+ });
+
+ it("should return true for URLs with query strings", () => { // a "?..." suffix after the extension must still match
+ expect(isAudioUrl("https://example.com/clip.mp3?token=xyz")).toBe(true);
+ });
+
+ it("should return false for image and video URLs", () => {
+ expect(isAudioUrl("data:image/png;base64,abc")).toBe(false);
+ expect(isAudioUrl("data:video/mp4;base64,abc")).toBe(false);
+ expect(isAudioUrl("https://example.com/photo.png")).toBe(false);
+ });
+
+ it("should return false for plain text strings", () => {
+ expect(isAudioUrl("hello world")).toBe(false);
+ expect(isAudioUrl("")).toBe(false); // the empty string matches neither rule
+ });
+});
+
+describe("isVideoUrl", () => { // covers the data:video/ prefix, the video file extensions and the fal.media URL case
+ it("should return true for data:video/ data URLs", () => {
+ expect(isVideoUrl("data:video/mp4;base64,abc123")).toBe(true);
+ expect(isVideoUrl("data:video/webm;base64,abc123")).toBe(true);
+ });
+
+ it("should return true for common video file extensions", () => {
+ expect(isVideoUrl("https://example.com/clip.mp4")).toBe(true);
+ expect(isVideoUrl("https://example.com/clip.webm")).toBe(true);
+ expect(isVideoUrl("https://example.com/clip.ogv")).toBe(true);
+ });
+
+ it("should return true for fal.media CDN URLs", () => { // NOTE(review): this URL also ends in .mp4, so a fal.media-specific rule is not exercised on its own
+ expect(isVideoUrl("https://v3b.fal.media/files/abc123/output.mp4")).toBe(true);
+ });
+
+ it("should be case-insensitive for extensions", () => {
+ expect(isVideoUrl("https://example.com/clip.MP4")).toBe(true);
+ expect(isVideoUrl("https://example.com/clip.WEBM")).toBe(true);
+ });
+
+ it("should return true for URLs with query strings", () => { // a "?..." suffix after the extension must still match
+ expect(isVideoUrl("https://example.com/clip.mp4?t=5")).toBe(true);
+ });
+
+ it("should return false for image and audio URLs", () => {
+ expect(isVideoUrl("data:image/png;base64,abc")).toBe(false);
+ expect(isVideoUrl("data:audio/mp3;base64,abc")).toBe(false);
+ expect(isVideoUrl("https://example.com/photo.jpg")).toBe(false);
+ });
+
+ it("should return false for plain text strings", () => {
+ expect(isVideoUrl("hello world")).toBe(false);
+ expect(isVideoUrl("")).toBe(false); // the empty string matches no rule
+ });
+});
diff --git a/frontend/src/app/common/util/media-type.util.ts b/frontend/src/app/common/util/media-type.util.ts
new file mode 100644
index 00000000000..d60446573a8
--- /dev/null
+++ b/frontend/src/app/common/util/media-type.util.ts
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// Extension patterns are case-insensitive and tolerate a trailing "?query" and/or "#fragment".
+const VIDEO_EXTENSION_PATTERN = /\.(mp4|webm|ogv)([?#].*)?$/i;
+const AUDIO_EXTENSION_PATTERN = /\.(mp3|wav|ogg|m4a|flac)([?#].*)?$/i;
+const IMAGE_EXTENSION_PATTERN = /\.(png|jpg|jpeg|gif|webp)([?#].*)?$/i;
+
+/**
+ * Reports whether `value` looks like a video reference: a `data:video/` URL, a string ending in
+ * .mp4/.webm/.ogv (optionally followed by a query string or fragment), or a URL under
+ * `https://v3b.fal.media/files/`.
+ *
+ * NOTE(review): the fal.media prefix rule ignores the file extension, so an image hosted under that
+ * prefix is also reported as a video; confirm that callers check `isImageUrl` first if that matters.
+ *
+ * @param value candidate string; non-string runtime values yield `false`.
+ */
+export function isVideoUrl(value: string): boolean {
+  if (typeof value !== "string") return false;
+  return (
+    VIDEO_EXTENSION_PATTERN.test(value) ||
+    value.startsWith("data:video/") ||
+    value.startsWith("https://v3b.fal.media/files/")
+  );
+}
+
+/**
+ * Reports whether `value` looks like an audio reference: a `data:audio/` URL or a string ending in
+ * .mp3/.wav/.ogg/.m4a/.flac, optionally followed by a query string or fragment.
+ *
+ * @param value candidate string; non-string runtime values yield `false`.
+ */
+export function isAudioUrl(value: string): boolean {
+  if (typeof value !== "string") return false;
+  return AUDIO_EXTENSION_PATTERN.test(value) || value.startsWith("data:audio/");
+}
+
+/**
+ * Reports whether `value` looks like an image reference: a `data:image/` URL or a string ending in
+ * .png/.jpg/.jpeg/.gif/.webp, optionally followed by a query string or fragment.
+ *
+ * @param value candidate string; non-string runtime values yield `false`.
+ */
+export function isImageUrl(value: string): boolean {
+  if (typeof value !== "string") return false;
+  return IMAGE_EXTENSION_PATTERN.test(value) || value.startsWith("data:image/");
+}
diff --git a/frontend/src/app/workspace/component/hugging-face-audio-upload/hugging-face-audio-upload.component.html b/frontend/src/app/workspace/component/hugging-face-audio-upload/hugging-face-audio-upload.component.html
new file mode 100644
index 00000000000..507528e8d4b
--- /dev/null
+++ b/frontend/src/app/workspace/component/hugging-face-audio-upload/hugging-face-audio-upload.component.html
@@ -0,0 +1,63 @@
+
+
+
+
+ Audio files are uploaded to temporary backend storage and referenced from the operator, so larger clips can be used
+ without bloating the workflow JSON.
+