Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit e53fa67

Browse files
committed
Merge branch 'main' into shuowei-blob-strcuture-data-change
2 parents ada2445 + 543ce52 commit e53fa67

File tree

5 files changed

+160
-161
lines changed

5 files changed

+160
-161
lines changed

.librarian/state.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677
1+
image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:1a2a85ab507aea26d787c06cc7979decb117164c81dd78a745982dfda80d4f68
22
libraries:
33
- id: bigframes
44
version: 2.35.0

bigframes/bigquery/_operations/ai.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -522,10 +522,10 @@ def generate_text(
522522
model (bigframes.ml.base.BaseEstimator or str):
523523
The model to use for text generation.
524524
data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
525-
The data to generate embeddings for. If a Series is provided, it is
526-
treated as the 'content' column. If a DataFrame is provided, it
527-
must contain a 'content' column, or you must rename the column you
528-
wish to embed to 'content'.
525+
The data to generate text for. If a Series is provided, it is
526+
treated as the 'prompt' column. If a DataFrame is provided, it
527+
must contain a 'prompt' column, or you must rename the column you
528+
wish to use as the prompt to 'prompt'.
529529
temperature (float, optional):
530530
A FLOAT64 value that is used for sampling promiscuity. The value
531531
must be in the range ``[0.0, 1.0]``. A lower temperature works well
@@ -638,10 +638,10 @@ def generate_table(
638638
model (bigframes.ml.base.BaseEstimator or str):
639639
The model to use for table generation.
640640
data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
641-
The data to generate embeddings for. If a Series is provided, it is
642-
treated as the 'content' column. If a DataFrame is provided, it
643-
must contain a 'content' column, or you must rename the column you
644-
wish to embed to 'content'.
641+
The data to generate a table for. If a Series is provided, it is
642+
treated as the 'prompt' column. If a DataFrame is provided, it
643+
must contain a 'prompt' column, or you must rename the column you
644+
wish to use as the prompt to 'prompt'.
645645
output_schema (str):
646646
A string defining the output schema (e.g., "col1 STRING, col2 INT64").
647647
temperature (float, optional):

bigframes/core/rewrite/identifiers.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,6 @@ def remap_variables(
5757
new_root = root.transform_children(lambda node: remapped_children[node])
5858

5959
# Step 3: Transform the current node using the mappings from its children.
60-
# "reversed" is required for InNode so that in case of a duplicate column ID,
61-
# the left child's mapping is the one that's kept.
62-
downstream_mappings: dict[identifiers.ColumnId, identifiers.ColumnId] = {
63-
k: v for mapping in reversed(new_child_mappings) for k, v in mapping.items()
64-
}
6560
if isinstance(new_root, nodes.InNode):
6661
new_root = typing.cast(nodes.InNode, new_root)
6762
new_root = dataclasses.replace(
@@ -71,6 +66,9 @@ def remap_variables(
7166
),
7267
)
7368
else:
69+
downstream_mappings: dict[identifiers.ColumnId, identifiers.ColumnId] = {
70+
k: v for mapping in new_child_mappings for k, v in mapping.items()
71+
}
7472
new_root = new_root.remap_refs(downstream_mappings)
7573

7674
# Step 4: Create new IDs for columns defined by the current node.
@@ -82,12 +80,8 @@ def remap_variables(
8280
new_root._validate()
8381

8482
# Step 5: Determine which mappings to propagate up to the parent.
85-
if root.defines_namespace:
86-
# If a node defines a new namespace (e.g., a join), mappings from its
87-
# children are not visible to its parents.
88-
mappings_for_parent = node_defined_mappings
89-
else:
90-
# Otherwise, pass up the combined mappings from children and the current node.
91-
mappings_for_parent = downstream_mappings | node_defined_mappings
83+
propagated_mappings = {
84+
old_id: new_id for old_id, new_id in zip(root.ids, new_root.ids)
85+
}
9286

93-
return new_root, mappings_for_parent
87+
return new_root, propagated_mappings

notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 97 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1362,161 +1362,119 @@
13621362
"id": "iRUi8AjG7cIf"
13631363
},
13641364
"source": [
1365-
"### 5. PDF chunking function"
1365+
"### 5. PDF extraction and chunking function\n",
1366+
"\n",
1367+
"This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library."
13661368
]
13671369
},
13681370
{
13691371
"cell_type": "code",
1370-
"execution_count": 3,
1371-
"metadata": {
1372-
"id": "oDDuYtUm5Yiy"
1373-
},
1372+
"execution_count": null,
1373+
"metadata": {},
13741374
"outputs": [],
13751375
"source": [
1376-
"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")"
1377-
]
1378-
},
1379-
{
1380-
"cell_type": "code",
1381-
"execution_count": 18,
1382-
"metadata": {
1383-
"colab": {
1384-
"base_uri": "https://localhost:8080/"
1385-
},
1386-
"id": "7jLpMYaj7nj8",
1387-
"outputId": "06d5456f-580f-4693-adff-2605104b056c"
1388-
},
1389-
"outputs": [
1390-
{
1391-
"name": "stderr",
1392-
"output_type": "stream",
1393-
"text": [
1394-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1395-
"instead of using `db_dtypes` in the future when available in pandas\n",
1396-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1397-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
1398-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
1399-
" return method(*args, **kwargs)\n",
1400-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1401-
"future version. Use `json_value_array` instead.\n",
1402-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n",
1403-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1404-
"future version. Use `json_value_array` instead.\n",
1405-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
1406-
]
1407-
}
1408-
],
1409-
"source": [
1410-
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
1376+
"# Construct the canonical connection ID\n",
1377+
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
1378+
"\n",
1379+
"@bpd.udf(\n",
1380+
" input_types=[str],\n",
1381+
" output_type=str,\n",
1382+
" dataset=DATASET_ID,\n",
1383+
" name=\"pdf_extract\",\n",
1384+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1385+
" packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
1386+
")\n",
1387+
"def pdf_extract(src_obj_ref_rt: str) -> str:\n",
1388+
" import io\n",
1389+
" import json\n",
1390+
" from pypdf import PdfReader\n",
1391+
" import requests\n",
1392+
" from requests import adapters\n",
1393+
" session = requests.Session()\n",
1394+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1395+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1396+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1397+
" response = session.get(src_url, timeout=30, stream=True)\n",
1398+
" response.raise_for_status()\n",
1399+
" pdf_bytes = response.content\n",
1400+
" pdf_file = io.BytesIO(pdf_bytes)\n",
1401+
" reader = PdfReader(pdf_file, strict=False)\n",
1402+
" all_text = \"\"\n",
1403+
" for page in reader.pages:\n",
1404+
" page_extract_text = page.extract_text()\n",
1405+
" if page_extract_text:\n",
1406+
" all_text += page_extract_text\n",
1407+
" return all_text\n",
1408+
"\n",
1409+
"@bpd.udf(\n",
1410+
" input_types=[str, int, int],\n",
1411+
" output_type=list[str],\n",
1412+
" dataset=DATASET_ID,\n",
1413+
" name=\"pdf_chunk\",\n",
1414+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1415+
" packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
1416+
")\n",
1417+
"def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:\n",
1418+
" import io\n",
1419+
" import json\n",
1420+
" from pypdf import PdfReader\n",
1421+
" import requests\n",
1422+
" from requests import adapters\n",
1423+
" session = requests.Session()\n",
1424+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1425+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1426+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1427+
" response = session.get(src_url, timeout=30, stream=True)\n",
1428+
" response.raise_for_status()\n",
1429+
" pdf_bytes = response.content\n",
1430+
" pdf_file = io.BytesIO(pdf_bytes)\n",
1431+
" reader = PdfReader(pdf_file, strict=False)\n",
1432+
" all_text_chunks = []\n",
1433+
" curr_chunk = \"\"\n",
1434+
" for page in reader.pages:\n",
1435+
" page_text = page.extract_text()\n",
1436+
" if page_text:\n",
1437+
" curr_chunk += page_text\n",
1438+
" while len(curr_chunk) >= chunk_size:\n",
1439+
" split_idx = curr_chunk.rfind(\" \", 0, chunk_size)\n",
1440+
" if split_idx == -1:\n",
1441+
" split_idx = chunk_size\n",
1442+
" actual_chunk = curr_chunk[:split_idx]\n",
1443+
" all_text_chunks.append(actual_chunk)\n",
1444+
" overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]\n",
1445+
" curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]\n",
1446+
" if curr_chunk:\n",
1447+
" all_text_chunks.append(curr_chunk)\n",
1448+
" return all_text_chunks"
14111449
]
14121450
},
14131451
{
14141452
"cell_type": "code",
1415-
"execution_count": 19,
1453+
"execution_count": null,
14161454
"metadata": {},
1417-
"outputs": [
1418-
{
1419-
"name": "stderr",
1420-
"output_type": "stream",
1421-
"text": [
1422-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1423-
"instead of using `db_dtypes` in the future when available in pandas\n",
1424-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1425-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
1426-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
1427-
" return method(*args, **kwargs)\n",
1428-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1429-
"future version. Use `json_value_array` instead.\n",
1430-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
1431-
]
1432-
},
1433-
{
1434-
"data": {
1435-
"text/html": [
1436-
"<div>\n",
1437-
"<style scoped>\n",
1438-
" .dataframe tbody tr th:only-of-type {\n",
1439-
" vertical-align: middle;\n",
1440-
" }\n",
1441-
"\n",
1442-
" .dataframe tbody tr th {\n",
1443-
" vertical-align: top;\n",
1444-
" }\n",
1445-
"\n",
1446-
" .dataframe thead th {\n",
1447-
" text-align: right;\n",
1448-
" }\n",
1449-
"</style>\n",
1450-
"<table border=\"1\" class=\"dataframe\">\n",
1451-
" <thead>\n",
1452-
" <tr style=\"text-align: right;\">\n",
1453-
" <th></th>\n",
1454-
" <th>chunked_verbose</th>\n",
1455-
" </tr>\n",
1456-
" </thead>\n",
1457-
" <tbody>\n",
1458-
" <tr>\n",
1459-
" <th>0</th>\n",
1460-
" <td>{'status': '', 'content': array([\"CritterCuisi...</td>\n",
1461-
" </tr>\n",
1462-
" </tbody>\n",
1463-
"</table>\n",
1464-
"<p>1 rows × 1 columns</p>\n",
1465-
"</div>[1 rows x 1 columns in total]"
1466-
],
1467-
"text/plain": [
1468-
" chunked_verbose\n",
1469-
"0 {'status': '', 'content': array([\"CritterCuisi...\n",
1470-
"\n",
1471-
"[1 rows x 1 columns]"
1472-
]
1473-
},
1474-
"execution_count": 19,
1475-
"metadata": {},
1476-
"output_type": "execute_result"
1477-
}
1478-
],
1455+
"outputs": [],
14791456
"source": [
1480-
"df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n",
1481-
"df_pdf[[\"chunked_verbose\"]]"
1457+
"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n",
1458+
"\n",
1459+
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
1460+
"access_urls = get_runtime_json_str(df_pdf[\"pdf\"], mode=\"R\")\n",
1461+
"\n",
1462+
"# Apply PDF extraction\n",
1463+
"df_pdf[\"extracted_text\"] = access_urls.apply(pdf_extract)\n",
1464+
"\n",
1465+
"# Apply PDF chunking\n",
1466+
"df_pdf[\"chunked\"] = access_urls.apply(pdf_chunk, args=(2000, 200))\n",
1467+
"\n",
1468+
"df_pdf[[\"extracted_text\", \"chunked\"]]"
14821469
]
14831470
},
14841471
{
14851472
"cell_type": "code",
1486-
"execution_count": 20,
1487-
"metadata": {
1488-
"id": "kaPvJATN7zlw"
1489-
},
1490-
"outputs": [
1491-
{
1492-
"name": "stderr",
1493-
"output_type": "stream",
1494-
"text": [
1495-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1496-
"instead of using `db_dtypes` in the future when available in pandas\n",
1497-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1498-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
1499-
]
1500-
},
1501-
{
1502-
"data": {
1503-
"text/plain": [
1504-
"0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
1505-
"0 on a level, stable surface to prevent tipping....\n",
1506-
"0 included)\\nto maintain the schedule during pow...\n",
1507-
"0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
1508-
"0 paperclip) for 5\\nseconds. This will reset all...\n",
1509-
"0 unit with a damp cloth. Do not immerse the bas...\n",
1510-
"0 continues,\\ncontact customer support.\\nE2: Foo...\n",
1511-
"Name: chunked, dtype: string"
1512-
]
1513-
},
1514-
"execution_count": 20,
1515-
"metadata": {},
1516-
"output_type": "execute_result"
1517-
}
1518-
],
1473+
"execution_count": null,
1474+
"metadata": {},
1475+
"outputs": [],
15191476
"source": [
1477+
"# Explode the chunks to see each chunk as a separate row\n",
15201478
"chunked = df_pdf[\"chunked\"].explode()\n",
15211479
"chunked"
15221480
]
@@ -1719,7 +1677,7 @@
17191677
"name": "python",
17201678
"nbconvert_exporter": "python",
17211679
"pygments_lexer": "ipython3",
1722-
"version": "3.13.0"
1680+
"version": "3.10.15"
17231681
}
17241682
},
17251683
"nbformat": 4,

0 commit comments

Comments
 (0)