apache · juliethecao · May 29, 2026 · Jun 15, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/amber/src/main/scala/org/apache/texera/web/auth/UserAuthenticator.scala b/amber/src/main/scala/org/apache/texera/web/auth/UserAuthenticator.scala
@@ -35,7 +35,7 @@ import java.util.Optional
 object UserAuthenticator extends Authenticator[JwtContext, SessionUser] with LazyLogging {
   override def authenticate(context: JwtContext): Optional[SessionUser] = {
     try {
-      Optional.of(JwtParser.claimsToSessionUser(context.getJwtClaims))
+      JwtParser.claimsToOptionalSessionUser(context.getJwtClaims)
     } catch {
       case e: Exception =>
         logger.error("Failed to authenticate the JwtContext", e)

diff --git a/common/auth/src/main/scala/org/apache/texera/auth/JwtParser.scala b/common/auth/src/main/scala/org/apache/texera/auth/JwtParser.scala
@@ -38,7 +38,7 @@ object JwtParser extends LazyLogging {
   /** Verify and parse a Bearer token string. */
   def parseToken(token: String): Optional[SessionUser] = {
     try {
-      Optional.of(claimsToSessionUser(JwtAuth.jwtConsumer.processToClaims(token)))
+      claimsToOptionalSessionUser(JwtAuth.jwtConsumer.processToClaims(token))
     } catch {
       case _: UnresolvableKeyException =>
         logger.error("Invalid JWT Signature")
@@ -49,6 +49,19 @@ object JwtParser extends LazyLogging {
     }
   }
 
+  /** Convert already-verified claims to a [[SessionUser]], returning empty (after logging)
+    * when a required claim is missing/invalid or has the wrong JSON type (jose4j
+    * `MalformedClaimException`); any other exception still propagates to the caller. */
+  def claimsToOptionalSessionUser(claims: JwtClaims): Optional[SessionUser] = {
+    try {
+      Optional.of(claimsToSessionUser(claims))
+    } catch {
+      case e @ (_: IllegalArgumentException | _: org.jose4j.jwt.MalformedClaimException) =>
+        logger.error(s"Invalid JWT claims: ${e.getMessage}")
+        Optional.empty()
+    }
+  }
+
   /** Build a [[SessionUser]] from already-verified claims. Used by both
     * [[parseToken]] (which verifies then calls this) and amber's
     * `UserAuthenticator` (which the toastshaman filter calls after its own
@@ -59,8 +72,12 @@ object JwtParser extends LazyLogging {
     val email = claims.getClaimValue("email", classOf[String])
     // jose4j returns Long after JSON round-trip but the original setClaim
     // call writes Integer; widen via Number to handle both cases.
-    val userId = claims.getClaimValue("userId", classOf[Number]).intValue()
-    val role = UserRoleEnum.valueOf(claims.getClaimValue("role").asInstanceOf[String])
+    val userId = Option(claims.getClaimValue("userId", classOf[Number]))
+      .map(_.intValue())
+      .getOrElse(throw new IllegalArgumentException("JWT claim 'userId' is required."))
+    val roleName = Option(claims.getClaimValue("role", classOf[String]))
+      .getOrElse(throw new IllegalArgumentException("JWT claim 'role' is required."))
+    val role = UserRoleEnum.valueOf(roleName)
     val googleId = claims.getClaimValue("googleId", classOf[String])
     val googleAvatar = claims.getClaimValue("googleAvatar", classOf[String])
     val user = new User(

diff --git a/common/auth/src/test/scala/org/apache/texera/auth/JwtParserSpec.scala b/common/auth/src/test/scala/org/apache/texera/auth/JwtParserSpec.scala
@@ -74,6 +74,18 @@ class JwtParserSpec extends AnyFlatSpec with Matchers {
     u.getGoogleAvatar shouldBe "avatar-blob"
   }
 
+  // Table-driven: "userId" and "role" are each removed in turn from the claims
+  // produced by buildClaims(), and the lenient converter must report absence.
+  // Registered names end in "... are missing userId" and "... are missing role".
+  for (requiredClaim <- Seq("userId", "role")) {
+    it should s"return empty when already-verified claims are missing $requiredClaim" in {
+      val strippedClaims = buildClaims()
+      strippedClaims.unsetClaim(requiredClaim)
+      val parsed = JwtParser.claimsToOptionalSessionUser(strippedClaims)
+      parsed.isPresent shouldBe false
+    }
+  }
+
   "JwtParser.parseToken" should "return empty on a structurally invalid token" in {
     JwtParser.parseToken("not-a-real-jwt").isPresent shouldBe false
   }

diff --git a/.../main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala b/.../main/scala/org/apache/texera/amber/operator/huggingFace/HuggingFaceInferenceOpDesc.scala
@@ -25,9 +25,12 @@ import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
 import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import org.apache.texera.amber.operator.PythonOperatorDescriptor
 import org.apache.texera.amber.operator.huggingFace.codegen.{
+  AudioTaskCodegen,
   CodegenContext,
   ImageTaskCodegen,
+  MediaGenCodegen,
   PythonCodegenBase,
+  QaRankingCodegen,
   TaskCodegen,
   TextGenCodegen
 }
@@ -95,6 +98,36 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
   @AutofillAttributeName
   var inputImageColumn: EncodableString = ""
 
+  @JsonProperty(value = "audioInput", required = false)
+  @JsonSchemaTitle("Audio Upload")
+  @JsonPropertyDescription("Upload audio for Hugging Face audio tasks")
+  var audioInput: EncodableString = ""
+
+  @JsonProperty(value = "inputAudioColumn", required = false)
+  @JsonSchemaTitle("Input Audio Column")
+  @JsonPropertyDescription("Column containing audio data from the input table")
+  @AutofillAttributeName
+  var inputAudioColumn: EncodableString = ""
+
+  @JsonProperty(value = "contextColumn", required = false)
+  @JsonSchemaTitle("Context Column")
+  @JsonPropertyDescription("Column containing the context passage for question answering")
+  @AutofillAttributeName
+  var contextColumn: EncodableString = ""
+
+  @JsonProperty(value = "candidateLabels", required = false)
+  @JsonSchemaTitle("Candidate Labels")
+  @JsonPropertyDescription("Comma-separated candidate labels for zero-shot classification")
+  var candidateLabels: EncodableString = ""
+
+  @JsonProperty(value = "sentencesColumn", required = false)
+  @JsonSchemaTitle("Sentences Column")
+  @JsonPropertyDescription(
+    "Column with comma-separated sentences for sentence similarity and text ranking"
+  )
+  @AutofillAttributeName
+  var sentencesColumn: EncodableString = ""
+
   @JsonProperty(
     value = "systemPrompt",
     required = false,
@@ -138,6 +171,9 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
     val byTask = scala.collection.mutable.Map.empty[String, TaskCodegen]
     byTask += (TextGenCodegen.task -> TextGenCodegen)
     ImageTaskCodegen.tasks.foreach(t => byTask += (t -> ImageTaskCodegen))
+    AudioTaskCodegen.tasks.foreach(t => byTask += (t -> AudioTaskCodegen))
+    MediaGenCodegen.tasks.foreach(t => byTask += (t -> MediaGenCodegen))
+    QaRankingCodegen.tasks.foreach(t => byTask += (t -> QaRankingCodegen))
     byTask.toMap
   }
 
@@ -181,6 +217,16 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
       if (imageInput == null) "" else imageInput
     val safeInputImageColumn: EncodableString =
       if (inputImageColumn == null) "" else inputImageColumn
+    val safeAudioInput: EncodableString =
+      if (audioInput == null) "" else audioInput
+    val safeInputAudioColumn: EncodableString =
+      if (inputAudioColumn == null) "" else inputAudioColumn
+    val safeContextColumn: EncodableString =
+      if (contextColumn == null) "" else contextColumn
+    val safeCandidateLabels: EncodableString =
+      if (candidateLabels == null) "" else candidateLabels
+    val safeSentencesColumn: EncodableString =
+      if (sentencesColumn == null) "" else sentencesColumn
 
     val ctx = CodegenContext(
       hfApiToken = safeToken,
@@ -192,7 +238,12 @@ class HuggingFaceInferenceOpDesc extends PythonOperatorDescriptor {
       safeMaxTokens = safeMaxTokens,
       safeTemp = safeTemp,
       imageInput = safeImageInput,
-      inputImageColumn = safeInputImageColumn
+      inputImageColumn = safeInputImageColumn,
+      audioInput = safeAudioInput,
+      inputAudioColumn = safeInputAudioColumn,
+      contextColumn = safeContextColumn,
+      candidateLabels = safeCandidateLabels,
+      sentencesColumn = safeSentencesColumn
     )
 
     PythonCodegenBase.render(ctx, codegenForTask(safeTask))

diff --git a/...rc/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/AudioTaskCodegen.scala b/...rc/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/AudioTaskCodegen.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.huggingFace.codegen
+
+/**
+  * Codegen for Hugging Face audio task families.
+  *
+  * ASR and audio-classification send audio bytes as the raw request body.
+  * Text-to-speech is prompt-driven and sends a JSON payload; its providers
+  * return either audio bytes directly or a JSON envelope pointing to audio.
+  */
+object AudioTaskCodegen extends TaskCodegen {
+
+  override val task: String = "automatic-speech-recognition"
+
+  override val tasks: Set[String] =
+    Set("automatic-speech-recognition", "audio-classification", "text-to-speech")
+
+  /** Python snippet selecting the payload: raw audio bytes, or a JSON prompt for TTS. */
+  override def payloadPython(ctx: CodegenContext): String =
+    """            if task in audio_only_tasks:
+      |                payload = current_audio_bytes
+      |                use_raw_binary_body = True
+      |                raw_binary_headers = audio_headers
+      |            elif task == "text-to-speech":
+      |                payload = {"inputs": prompt_value}""".stripMargin
+
+  /** Python snippet mapping the response `body` to a string (data URL, transcript, or
+    * JSON dump); lists are verified non-empty before indexing so odd shapes fall through. */
+  override def parsePython(ctx: CodegenContext): String =
+    """            if task == "text-to-speech":
+      |                if isinstance(body, dict):
+      |                    if "output" in body:
+      |                        out = body["output"]
+      |                        url = out[0] if isinstance(out, list) and out else out
+      |                        if isinstance(url, str) and url.startswith("http"):
+      |                            return self._url_to_data_url(url)
+      |                    if "audio" in body:
+      |                        audio = body["audio"]
+      |                        if isinstance(audio, dict):
+      |                            if "url" in audio:
+      |                                return self._url_to_data_url(audio["url"])
+      |                            if "b64_json" in audio:
+      |                                return f"data:audio/mpeg;base64,{audio['b64_json']}"
+      |                    if "data" in body:
+      |                        data = body["data"]
+      |                        if isinstance(data, list) and data and isinstance(data[0], dict):
+      |                            if "url" in data[0]:
+      |                                return self._url_to_data_url(data[0]["url"])
+      |                            if "b64_json" in data[0]:
+      |                                return f"data:audio/mpeg;base64,{data[0]['b64_json']}"
+      |                return json.dumps(body)
+      |            elif task == "automatic-speech-recognition":
+      |                if isinstance(body, dict):
+      |                    if "text" in body:
+      |                        return body["text"]
+      |                    if "generated_text" in body:
+      |                        return body["generated_text"]
+      |                return json.dumps(body)
+      |            elif task == "audio-classification":
+      |                return json.dumps(body)""".stripMargin
+}
diff --git a/...rc/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/ImageTaskCodegen.scala b/...rc/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/ImageTaskCodegen.scala
@@ -90,26 +90,16 @@ object ImageTaskCodegen extends TaskCodegen {
       |                use_raw_binary_body = True
       |                raw_binary_headers = image_headers
       |            elif task == "zero-shot-image-classification":
-      |                # Zero-shot requires the caller to supply candidate labels.
-      |                # We reuse the prompt column as a comma-separated label list so
-      |                # the task is shippable without a dedicated operator field.
-      |                # TODO: replace with a first-class `candidateLabels` field once
-      |                # the property panel supports task-specific inputs.
-      |                #
-      |                # Fail fast if usable labels can't be derived. Both modes lead to
-      |                # a meaningless inference call:
-      |                #   1. Empty prompt column          -> labels = []
-      |                #      The HF API rejects candidate_labels: [] with an opaque 400.
-      |                #   2. Missing prompt column        -> upstream sets prompt_value
-      |                #      to the fallback "What is shown in this image?", which has
-      |                #      no comma, so labels collapses to a single nonsense entry.
-      |                # Zero-shot classification needs >= 2 candidate labels to be
-      |                # meaningful — surface a configuration error in both cases.
-      |                labels = [s.strip() for s in prompt_value.split(",") if s.strip()]
+      |                # Prefer the dedicated candidateLabels property; fall back to
+      |                # the prompt column for backward compatibility.
+      |                label_source = (self.CANDIDATE_LABELS or "").strip() if self.CANDIDATE_LABELS else ""
+      |                if not label_source and prompt_value:
+      |                    label_source = prompt_value
+      |                labels = [s.strip() for s in label_source.split(",") if s.strip()]
       |                if len(labels) < 2:
       |                    raise ValueError(
       |                        "zero-shot-image-classification requires at least 2 candidate "
-      |                        "labels: provide a comma-separated list in the prompt column."
+      |                        "labels: provide a comma-separated list in the Candidate Labels field."
       |                    )
       |                payload = {
       |                    "inputs": self._image_input_as_base64(current_image_bytes),

diff --git a/...src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/MediaGenCodegen.scala b/...src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/MediaGenCodegen.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.huggingFace.codegen
+
+/**
+  * Codegen for prompt-driven media generation tasks.
+  *
+  * Providers return media in several shapes: raw bytes, OpenAI-style
+  * b64_json, or URLs. URL responses are normalized to data URLs by the
+  * shared `_url_to_data_url` helper so downstream result rendering receives
+  * a stable string format.
+  */
+object MediaGenCodegen extends TaskCodegen {
+
+  override val task: String = "text-to-image"
+
+  override val tasks: Set[String] = Set("text-to-image", "text-to-video")
+
+  /** Python snippet building the JSON request payload from the prompt value. */
+  override def payloadPython(ctx: CodegenContext): String =
+    """            payload = {"inputs": prompt_value}""".stripMargin
+
+  /** Python snippet mapping the response `body` to a data-URL string, or its JSON dump
+    * when no known shape matches; list-typed fields are validated before `[0]` is used. */
+  override def parsePython(ctx: CodegenContext): String =
+    """            if task == "text-to-image":
+      |                if isinstance(body, dict):
+      |                    if "output" in body:
+      |                        out = body["output"]
+      |                        url = out[0] if isinstance(out, list) and out else out
+      |                        if isinstance(url, str) and url.startswith("http"):
+      |                            return self._url_to_data_url(url)
+      |                    if "images" in body:
+      |                        images = body["images"] if isinstance(body["images"], list) else []
+      |                        if images and isinstance(images[0], dict) and "url" in images[0]:
+      |                            return self._url_to_data_url(images[0]["url"])
+      |                    if "data" in body:
+      |                        data = body["data"]
+      |                        if isinstance(data, dict) and "outputs" in data:
+      |                            outputs = data["outputs"] if isinstance(data["outputs"], list) else []
+      |                            if outputs and isinstance(outputs[0], str) and outputs[0].startswith("http"):
+      |                                return self._url_to_data_url(outputs[0])
+      |                        if isinstance(data, list) and data and isinstance(data[0], dict):
+      |                            if "b64_json" in data[0]:
+      |                                return f"data:image/png;base64,{data[0]['b64_json']}"
+      |                            if "url" in data[0]:
+      |                                return self._url_to_data_url(data[0]["url"])
+      |                return json.dumps(body)
+      |            elif task == "text-to-video":
+      |                if isinstance(body, dict):
+      |                    if "output" in body:
+      |                        out = body["output"]
+      |                        url = out[0] if isinstance(out, list) and out else out
+      |                        if isinstance(url, str) and url.startswith("http"):
+      |                            return self._url_to_data_url(url)
+      |                    if "video" in body:
+      |                        video = body["video"]
+      |                        if isinstance(video, dict) and "url" in video:
+      |                            return self._url_to_data_url(video["url"])
+      |                return json.dumps(body)""".stripMargin
+}