Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit e53fa67

Browse files
committed
Merge branch 'main' into shuowei-blob-strcuture-data-change
2 parents ada2445 + 543ce52 commit e53fa67

File tree

5 files changed

+160
-161
lines changed

5 files changed

+160
-161
lines changed

.librarian/state.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677
1+
image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:1a2a85ab507aea26d787c06cc7979decb117164c81dd78a745982dfda80d4f68
22
libraries:
33
- id: bigframes
44
version: 2.35.0

bigframes/bigquery/_operations/ai.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -522,10 +522,10 @@ def generate_text(
522522
model (bigframes.ml.base.BaseEstimator or str):
523523
The model to use for text generation.
524524
data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
525-
The data to generate embeddings for. If a Series is provided, it is
526-
treated as the 'content' column. If a DataFrame is provided, it
527-
must contain a 'content' column, or you must rename the column you
528-
wish to embed to 'content'.
525+
The data to generate text for. If a Series is provided, it is
526+
treated as the 'prompt' column. If a DataFrame is provided, it
527+
must contain a 'prompt' column, or you must rename the column you
528+
wish to use as the prompt to 'prompt'.
529529
temperature (float, optional):
530530
A FLOAT64 value that is used for sampling promiscuity. The value
531531
must be in the range ``[0.0, 1.0]``. A lower temperature works well
@@ -638,10 +638,10 @@ def generate_table(
638638
model (bigframes.ml.base.BaseEstimator or str):
639639
The model to use for table generation.
640640
data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
641-
The data to generate embeddings for. If a Series is provided, it is
642-
treated as the 'content' column. If a DataFrame is provided, it
643-
must contain a 'content' column, or you must rename the column you
644-
wish to embed to 'content'.
641+
The data to generate a table for. If a Series is provided, it is
642+
treated as the 'prompt' column. If a DataFrame is provided, it
643+
must contain a 'prompt' column, or you must rename the column you
644+
wish to use as the prompt to 'prompt'.
645645
output_schema (str):
646646
A string defining the output schema (e.g., "col1 STRING, col2 INT64").
647647
temperature (float, optional):

bigframes/core/rewrite/identifiers.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,6 @@ def remap_variables(
5757
new_root = root.transform_children(lambda node: remapped_children[node])
5858

5959
# Step 3: Transform the current node using the mappings from its children.
60-
# "reversed" is required for InNode so that in case of a duplicate column ID,
61-
# the left child's mapping is the one that's kept.
62-
downstream_mappings: dict[identifiers.ColumnId, identifiers.ColumnId] = {
63-
k: v for mapping in reversed(new_child_mappings) for k, v in mapping.items()
64-
}
6560
if isinstance(new_root, nodes.InNode):
6661
new_root = typing.cast(nodes.InNode, new_root)
6762
new_root = dataclasses.replace(
@@ -71,6 +66,9 @@ def remap_variables(
7166
),
7267
)
7368
else:
69+
downstream_mappings: dict[identifiers.ColumnId, identifiers.ColumnId] = {
70+
k: v for mapping in new_child_mappings for k, v in mapping.items()
71+
}
7472
new_root = new_root.remap_refs(downstream_mappings)
7573

7674
# Step 4: Create new IDs for columns defined by the current node.
@@ -82,12 +80,8 @@ def remap_variables(
8280
new_root._validate()
8381

8482
# Step 5: Determine which mappings to propagate up to the parent.
85-
if root.defines_namespace:
86-
# If a node defines a new namespace (e.g., a join), mappings from its
87-
# children are not visible to its parents.
88-
mappings_for_parent = node_defined_mappings
89-
else:
90-
# Otherwise, pass up the combined mappings from children and the current node.
91-
mappings_for_parent = downstream_mappings | node_defined_mappings
83+
propagated_mappings = {
84+
old_id: new_id for old_id, new_id in zip(root.ids, new_root.ids)
85+
}
9286

93-
return new_root, mappings_for_parent
87+
return new_root, propagated_mappings

notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 97 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1362,161 +1362,119 @@
13621362
"id": "iRUi8AjG7cIf"
13631363
},
13641364
"source": [
1365-
"### 5. PDF chunking function"
1365+
"### 5. PDF extraction and chunking function\n",
1366+
"\n",
1367+
"This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library."
13661368
]
13671369
},
13681370
{
13691371
"cell_type": "code",
1370-
"execution_count": 3,
1371-
"metadata": {
1372-
"id": "oDDuYtUm5Yiy"
1373-
},
1372+
"execution_count": null,
1373+
"metadata": {},
13741374
"outputs": [],
13751375
"source": [
1376-
"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")"
1377-
]
1378-
},
1379-
{
1380-
"cell_type": "code",
1381-
"execution_count": 18,
1382-
"metadata": {
1383-
"colab": {
1384-
"base_uri": "https://localhost:8080/"
1385-
},
1386-
"id": "7jLpMYaj7nj8",
1387-
"outputId": "06d5456f-580f-4693-adff-2605104b056c"
1388-
},
1389-
"outputs": [
1390-
{
1391-
"name": "stderr",
1392-
"output_type": "stream",
1393-
"text": [
1394-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1395-
"instead of using `db_dtypes` in the future when available in pandas\n",
1396-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1397-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
1398-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
1399-
" return method(*args, **kwargs)\n",
1400-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1401-
"future version. Use `json_value_array` instead.\n",
1402-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n",
1403-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1404-
"future version. Use `json_value_array` instead.\n",
1405-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
1406-
]
1407-
}
1408-
],
1409-
"source": [
1410-
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
1376+
"# Construct the canonical connection ID\n",
1377+
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
1378+
"\n",
1379+
"@bpd.udf(\n",
1380+
" input_types=[str],\n",
1381+
" output_type=str,\n",
1382+
" dataset=DATASET_ID,\n",
1383+
" name=\"pdf_extract\",\n",
1384+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1385+
" packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
1386+
")\n",
1387+
"def pdf_extract(src_obj_ref_rt: str) -> str:\n",
1388+
" import io\n",
1389+
" import json\n",
1390+
" from pypdf import PdfReader\n",
1391+
" import requests\n",
1392+
" from requests import adapters\n",
1393+
" session = requests.Session()\n",
1394+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1395+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1396+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1397+
" response = session.get(src_url, timeout=30, stream=True)\n",
1398+
" response.raise_for_status()\n",
1399+
" pdf_bytes = response.content\n",
1400+
" pdf_file = io.BytesIO(pdf_bytes)\n",
1401+
" reader = PdfReader(pdf_file, strict=False)\n",
1402+
" all_text = \"\"\n",
1403+
" for page in reader.pages:\n",
1404+
" page_extract_text = page.extract_text()\n",
1405+
" if page_extract_text:\n",
1406+
" all_text += page_extract_text\n",
1407+
" return all_text\n",
1408+
"\n",
1409+
"@bpd.udf(\n",
1410+
" input_types=[str, int, int],\n",
1411+
" output_type=list[str],\n",
1412+
" dataset=DATASET_ID,\n",
1413+
" name=\"pdf_chunk\",\n",
1414+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1415+
" packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
1416+
")\n",
1417+
"def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:\n",
1418+
" import io\n",
1419+
" import json\n",
1420+
" from pypdf import PdfReader\n",
1421+
" import requests\n",
1422+
" from requests import adapters\n",
1423+
" session = requests.Session()\n",
1424+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1425+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1426+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1427+
" response = session.get(src_url, timeout=30, stream=True)\n",
1428+
" response.raise_for_status()\n",
1429+
" pdf_bytes = response.content\n",
1430+
" pdf_file = io.BytesIO(pdf_bytes)\n",
1431+
" reader = PdfReader(pdf_file, strict=False)\n",
1432+
" all_text_chunks = []\n",
1433+
" curr_chunk = \"\"\n",
1434+
" for page in reader.pages:\n",
1435+
" page_text = page.extract_text()\n",
1436+
" if page_text:\n",
1437+
" curr_chunk += page_text\n",
1438+
" while len(curr_chunk) >= chunk_size:\n",
1439+
" split_idx = curr_chunk.rfind(\" \", 0, chunk_size)\n",
1440+
" if split_idx == -1:\n",
1441+
" split_idx = chunk_size\n",
1442+
" actual_chunk = curr_chunk[:split_idx]\n",
1443+
" all_text_chunks.append(actual_chunk)\n",
1444+
" overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]\n",
1445+
" curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]\n",
1446+
" if curr_chunk:\n",
1447+
" all_text_chunks.append(curr_chunk)\n",
1448+
" return all_text_chunks"
14111449
]
14121450
},
14131451
{
14141452
"cell_type": "code",
1415-
"execution_count": 19,
1453+
"execution_count": null,
14161454
"metadata": {},
1417-
"outputs": [
1418-
{
1419-
"name": "stderr",
1420-
"output_type": "stream",
1421-
"text": [
1422-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1423-
"instead of using `db_dtypes` in the future when available in pandas\n",
1424-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1425-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
1426-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
1427-
" return method(*args, **kwargs)\n",
1428-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1429-
"future version. Use `json_value_array` instead.\n",
1430-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
1431-
]
1432-
},
1433-
{
1434-
"data": {
1435-
"text/html": [
1436-
"<div>\n",
1437-
"<style scoped>\n",
1438-
" .dataframe tbody tr th:only-of-type {\n",
1439-
" vertical-align: middle;\n",
1440-
" }\n",
1441-
"\n",
1442-
" .dataframe tbody tr th {\n",
1443-
" vertical-align: top;\n",
1444-
" }\n",
1445-
"\n",
1446-
" .dataframe thead th {\n",
1447-
" text-align: right;\n",
1448-
" }\n",
1449-
"</style>\n",
1450-
"<table border=\"1\" class=\"dataframe\">\n",
1451-
" <thead>\n",
1452-
" <tr style=\"text-align: right;\">\n",
1453-
" <th></th>\n",
1454-
" <th>chunked_verbose</th>\n",
1455-
" </tr>\n",
1456-
" </thead>\n",
1457-
" <tbody>\n",
1458-
" <tr>\n",
1459-
" <th>0</th>\n",
1460-
" <td>{'status': '', 'content': array([\"CritterCuisi...</td>\n",
1461-
" </tr>\n",
1462-
" </tbody>\n",
1463-
"</table>\n",
1464-
"<p>1 rows × 1 columns</p>\n",
1465-
"</div>[1 rows x 1 columns in total]"
1466-
],
1467-
"text/plain": [
1468-
" chunked_verbose\n",
1469-
"0 {'status': '', 'content': array([\"CritterCuisi...\n",
1470-
"\n",
1471-
"[1 rows x 1 columns]"
1472-
]
1473-
},
1474-
"execution_count": 19,
1475-
"metadata": {},
1476-
"output_type": "execute_result"
1477-
}
1478-
],
1455+
"outputs": [],
14791456
"source": [
1480-
"df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n",
1481-
"df_pdf[[\"chunked_verbose\"]]"
1457+
"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n",
1458+
"\n",
1459+
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
1460+
"access_urls = get_runtime_json_str(df_pdf[\"pdf\"], mode=\"R\")\n",
1461+
"\n",
1462+
"# Apply PDF extraction\n",
1463+
"df_pdf[\"extracted_text\"] = access_urls.apply(pdf_extract)\n",
1464+
"\n",
1465+
"# Apply PDF chunking\n",
1466+
"df_pdf[\"chunked\"] = access_urls.apply(pdf_chunk, args=(2000, 200))\n",
1467+
"\n",
1468+
"df_pdf[[\"extracted_text\", \"chunked\"]]"
14821469
]
14831470
},
14841471
{
14851472
"cell_type": "code",
1486-
"execution_count": 20,
1487-
"metadata": {
1488-
"id": "kaPvJATN7zlw"
1489-
},
1490-
"outputs": [
1491-
{
1492-
"name": "stderr",
1493-
"output_type": "stream",
1494-
"text": [
1495-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1496-
"instead of using `db_dtypes` in the future when available in pandas\n",
1497-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1498-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
1499-
]
1500-
},
1501-
{
1502-
"data": {
1503-
"text/plain": [
1504-
"0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
1505-
"0 on a level, stable surface to prevent tipping....\n",
1506-
"0 included)\\nto maintain the schedule during pow...\n",
1507-
"0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
1508-
"0 paperclip) for 5\\nseconds. This will reset all...\n",
1509-
"0 unit with a damp cloth. Do not immerse the bas...\n",
1510-
"0 continues,\\ncontact customer support.\\nE2: Foo...\n",
1511-
"Name: chunked, dtype: string"
1512-
]
1513-
},
1514-
"execution_count": 20,
1515-
"metadata": {},
1516-
"output_type": "execute_result"
1517-
}
1518-
],
1473+
"execution_count": null,
1474+
"metadata": {},
1475+
"outputs": [],
15191476
"source": [
1477+
"# Explode the chunks to see each chunk as a separate row\n",
15201478
"chunked = df_pdf[\"chunked\"].explode()\n",
15211479
"chunked"
15221480
]
@@ -1719,7 +1677,7 @@
17191677
"name": "python",
17201678
"nbconvert_exporter": "python",
17211679
"pygments_lexer": "ipython3",
1722-
"version": "3.13.0"
1680+
"version": "3.10.15"
17231681
}
17241682
},
17251683
"nbformat": 4,

0 commit comments

Comments
 (0)