You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
@@ -5,6 +5,17 @@ description: Use scalar UDTFs for 1:N row expansion — split videos into clips,
5
5
icon: diagram-subtask
6
6
---
7
7
8
+
{/* Pull in the reusable Python/TS code snippets rendered by this page. */}
import {
  PyScalarUdtfIterator,
  PyScalarUdtfList,
  PyScalarUdtfBatch,
  PyCreateScalarUdtfView,
  PyAddColumnsScalarUdtf,
  PyIncrementalRefresh,
  PyChainingUdtfViews,
  PyDocumentChunkingFull,
} from '/snippets/geneva_scalar_udtfs.mdx';
18
+
8
19
<Badge>Beta — introduced in Geneva 0.11.0</Badge>
9
20
10
21
Standard UDFs produce exactly **one output value per input row**. Scalar UDTFs enable **1:N row expansion** — each source row can produce multiple output rows. The results are stored as a materialized view with MV-style incremental refresh.
@@ -19,24 +30,11 @@ Standard UDFs produce exactly **one output value per input row**. Scalar UDTFs e
19
30
20
31
Use the `@scalar_udtf` decorator on a function that **yields** output rows. Geneva infers the output schema from the return type annotation.
For vectorized processing, use `batch=True`. The function receives Arrow arrays and returns a `RecordBatch` of expanded rows:
57
+
For vectorized processing, use `batch=True`. The function receives Arrow arrays and returns a `RecordBatch` of expanded rows. Because the return type `pa.RecordBatch` cannot be inferred, you must supply `output_schema` explicitly:
Child rows automatically include the parent's columns — no manual join required. The columns available in the child table are determined by the query's `.select()`:
@@ -131,17 +102,11 @@ The first three rows come from the `/v/a.mp4` source row, the last two from `/v/
131
102
132
103
Since scalar UDTF views are materialized views, you can add UDF-computed columns to the child table and backfill them:
// Snippet: create a 1:N scalar-UDTF materialized view over a source table,
// then populate it with refresh(). (Fixed fused `exportconst` token and
// restored the 4-space indentation lost inside the snippet string.)
export const PyCreateScalarUdtfView = "import geneva\n\ndb = geneva.connect(\"/data/mydb\")\nvideos = db.open_table(\"videos\")\n\n# Create the 1:N materialized view\nclips = db.create_scalar_udtf_view(\n    \"clips\",\n    source=videos.search(None).select([\"video_path\", \"metadata\"]),\n    scalar_udtf=extract_clips,\n)\n\n# Populate — runs the UDTF on every source row\nclips.refresh()\n";
8
+
9
+
// Snippet: end-to-end document-chunking example — define a chunking scalar
// UDTF, materialize it as a 1:N view, add + backfill an embedding column,
// then query parent and child columns together. (String left byte-identical;
// it is rendered verbatim in the docs.)
export const PyDocumentChunkingFull = "from geneva import connect, scalar_udtf, udf\nfrom typing import Iterator, NamedTuple\nimport pyarrow as pa\n\nclass Chunk(NamedTuple):\n chunk_index: int\n chunk_text: str\n\n@scalar_udtf\ndef chunk_document(text: str) -> Iterator[Chunk]:\n \"\"\"Split a document into overlapping chunks.\"\"\"\n words = text.split()\n chunk_size = 500\n overlap = 50\n for i, start in enumerate(range(0, len(words), chunk_size - overlap)):\n chunk_words = words[start:start + chunk_size]\n yield Chunk(chunk_index=i, chunk_text=\" \".join(chunk_words))\n\ndb = connect(\"/data/mydb\")\ndocs = db.open_table(\"documents\")\n\n# Create chunked view — inherits doc_id, title, etc. from source\nchunks = db.create_scalar_udtf_view(\n \"doc_chunks\",\n source=docs.search(None).select([\"doc_id\", \"title\", \"text\"]),\n scalar_udtf=chunk_document,\n)\nchunks.refresh()\n\n# Add embeddings to chunks for semantic search\n@udf(data_type=pa.list_(pa.float32(), 1536))\ndef embed_text(chunk_text: str) -> list[float]:\n return embedding_model.encode(chunk_text)\n\nchunks.add_columns({\"embedding\": embed_text})\nchunks.backfill(\"embedding\") # Backfills embeddings on all existing chunks\n\n# Query — parent columns available alongside chunk columns\nchunks.search(None).select([\"doc_id\", \"title\", \"chunk_text\", \"embedding\"]).to_pandas()\n";
10
+
11
+
// Snippet: the standalone chunking UDTF definition. Fixes applied:
// - fused `exportconst` token restored to `export const`;
// - the embedded Python had an un-indented docstring (IndentationError as
//   shown) — indentation reconstructed;
// - `"".join(chunk_words)` joined words with no separator, inconsistent with
//   the full example (PyDocumentChunkingFull) which uses `" ".join(...)`;
//   changed to a space join so both snippets agree.
export const PyDocumentChunkingUdtf = "from geneva import scalar_udtf\nfrom typing import Iterator, NamedTuple\n\nclass Chunk(NamedTuple):\n    chunk_index: int\n    chunk_text: str\n\n@scalar_udtf\ndef chunk_document(text: str) -> Iterator[Chunk]:\n    \"\"\"Split a document into overlapping chunks.\"\"\"\n    words = text.split()\n    chunk_size = 500\n    overlap = 50\n    for i, start in enumerate(range(0, len(words), chunk_size - overlap)):\n        chunk_words = words[start:start + chunk_size]\n        yield Chunk(chunk_index=i, chunk_text=\" \".join(chunk_words))\n";
12
+
13
+
// Snippet: incremental MV-style refresh — only newly added source rows are
// processed. (Fixed fused `exportconst` token from the scrape.)
export const PyIncrementalRefresh = "# Add new videos to the source table\nvideos.add(new_video_data)\n\n# Incremental refresh — only processes the new videos\nclips.refresh()\n";
14
+
15
+
// Snippet: vectorized UDTF — batch=True with an explicit output_schema, since
// a pa.RecordBatch return type cannot be inferred. (Fixed fused `exportconst`
// token and restored indentation inside the embedded Python.)
export const PyScalarUdtfBatch = "@scalar_udtf(batch=True, output_schema=clip_schema)\ndef extract_clips(batch: pa.RecordBatch) -> pa.RecordBatch:\n    \"\"\"Process rows in batches. Same 1:N semantic per row.\"\"\"\n    ...\n";
// Snippet: add a SQL-computed column via add_columns. (Fixed fused
// `exportconst` token from the scrape.)
export const PyAddColumnsCalculated = "# Add a discounted price column (10% discount)\ntable.add_columns({\"discounted_price\": \"cast((price * 0.9) as float)\"})\n";
4
6
5
7
// Snippet: add a column with a constant default via a cast expression.
// (Fixed fused `exportconst` token from the scrape.)
export const PyAddColumnsDefaultValues = "# Add a stock status column with default value\ntable.add_columns({\"in_stock\": \"cast(true as boolean)\"})\n";
// Snippet: conditional row update followed by a count_rows verification.
// (Fixed fused `exportconst` token from the scrape.)
export const PyVersioningUpdateData = "table.update(where=\"author='Richard'\", values={\"author\": \"Richard Daniel Sanchez\"})\nrows_after_update = table.count_rows(\"author = 'Richard Daniel Sanchez'\")\nprint(f\"Rows updated to Richard Daniel Sanchez: {rows_after_update}\")\n";
// Snippet: TypeScript equivalent of the default-value addColumns example.
// (Fixed fused `exportconst` token from the scrape.)
export const TsAddColumnsDefaultValues = "// Add a stock status column with default value\nawait schemaAddTable.addColumns([\n  {\n    name: \"in_stock\",\n    valueSql: \"cast(true as boolean)\",\n  },\n]);\n";
0 commit comments