refactor: use direct API for audio transcription

shuoweil · shuoweil · commit 11a8b5eaf279 · 2026-02-07T06:47:31.000Z
diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb
@@ -91,7 +91,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 3,
+      "execution_count": 9,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -1451,99 +1451,112 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "### 6. Audio transcribe function"
+        "### 6. Audio transcribe"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 21,
+      "execution_count": 10,
       "metadata": {},
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-            "instead of using `db_dtypes` in the future when available in pandas\n",
-            "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-            "  warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
-          ]
-        }
-      ],
+      "outputs": [],
       "source": [
         "audio_gcs_path = \"gs://bigframes_blob_test/audio/*\"\n",
         "df = bpd.from_glob_path(audio_gcs_path, name=\"audio\")"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 22,
+      "execution_count": 11,
       "metadata": {},
       "outputs": [
         {
           "name": "stderr",
           "output_type": "stream",
           "text": [
-            "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-            "instead of using `db_dtypes` in the future when available in pandas\n",
-            "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-            "  warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-            "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
+            "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
             "instead of using `db_dtypes` in the future when available in pandas\n",
             "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
             "  warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
           ]
         },
         {
           "data": {
+            "text/html": [
+              "<pre>0    Now, as all books, not primarily intended as p...</pre>"
+            ],
             "text/plain": [
               "0    Now, as all books, not primarily intended as p...\n",
               "Name: transcribed_content, dtype: string"
             ]
           },
-          "execution_count": 22,
+          "execution_count": 11,
           "metadata": {},
           "output_type": "execute_result"
         }
       ],
       "source": [
-        "transcribed_series = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=False)\n",
+        "import bigframes.bigquery as bbq\n",
+        "import bigframes.operations as ops\n",
+        "\n",
+        "# The audio_transcribe function is a convenience wrapper around bigframes.bigquery.ai.generate.\n",
+        "# Here's how to perform the same operation directly:\n",
+        "\n",
+        "audio_series = df['audio']\n",
+        "prompt_text = (\n",
+        "    \"**Task:** Transcribe the provided audio. **Instructions:** - Your response \"\n",
+        "    \"must contain only the verbatim transcription of the audio. - Do not include \"\n",
+        "    \"any introductory text, summaries, or conversational filler in your response. \"\n",
+        "    \"The output should begin directly with the first word of the audio.\"\n",
+        ")\n",
+        "\n",
+        "# Convert the audio series to the runtime representation required by the model.\n",
+        "# This involves fetching metadata and getting a signed access URL.\n",
+        "audio_metadata = audio_series._apply_unary_op(ops.obj_fetch_metadata_op)\n",
+        "audio_runtime = audio_metadata._apply_unary_op(ops.ObjGetAccessUrl(mode=\"R\"))\n",
+        "\n",
+        "transcribed_results = bbq.ai.generate(\n",
+        "    prompt=(prompt_text, audio_runtime),\n",
+        "    endpoint=\"gemini-2.0-flash-001\",\n",
+        "    model_params={\"generationConfig\": {\"temperature\": 0.0}},\n",
+        ")\n",
+        "\n",
+        "transcribed_series = transcribed_results.struct.field(\"result\").rename(\"transcribed_content\")\n",
         "transcribed_series"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 23,
+      "execution_count": 12,
       "metadata": {},
       "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-            "instead of using `db_dtypes` in the future when available in pandas\n",
-            "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-            "  warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-            "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-            "instead of using `db_dtypes` in the future when available in pandas\n",
-            "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-            "  warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
-          ]
-        },
         {
           "data": {
+            "text/html": [
+              "<pre>0    {'status': '', 'content': 'Now, as all books, ...</pre>"
+            ],
             "text/plain": [
               "0    {'status': '', 'content': 'Now, as all books, ...\n",
               "Name: transcription_results, dtype: struct<status: string, content: string>[pyarrow]"
             ]
           },
-          "execution_count": 23,
+          "execution_count": 12,
           "metadata": {},
           "output_type": "execute_result"
         }
       ],
       "source": [
-        "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
+        "# To get verbose results (including status), extract both the `result` and `status` fields from the struct returned by ai.generate.\n",
+        "transcribed_content_series = transcribed_results.struct.field(\"result\")\n",
+        "transcribed_status_series = transcribed_results.struct.field(\"status\")\n",
+        "\n",
+        "transcribed_series_verbose = bpd.DataFrame(\n",
+        "    {\n",
+        "        \"status\": transcribed_status_series,\n",
+        "        \"content\": transcribed_content_series,\n",
+        "    }\n",
+        ")\n",
+        "# Package as a struct so the output matches the format of audio_transcribe(verbose=True)\n",
+        "transcribed_series_verbose = bbq.struct(transcribed_series_verbose).rename(\"transcription_results\")\n",
         "transcribed_series_verbose"
       ]
     }
@@ -1567,7 +1580,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.10.18"
+      "version": "3.13.0"
     }
   },
   "nbformat": 4,