
Commit 0bfabfd

revert branch UNPICK
1 parent 9a4709d

15 files changed: 811 additions & 685 deletions


Cargo.lock

Lines changed: 258 additions & 276 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 5 additions & 5 deletions

@@ -17,7 +17,7 @@

 [package]
 name = "datafusion-python"
-version = "49.0.1"
+version = "49.0.0"
 homepage = "https://datafusion.apache.org/python"
 repository = "https://github.com/apache/datafusion-python"
 authors = ["Apache DataFusion <dev@datafusion.apache.org>"]

@@ -39,10 +39,10 @@ pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"]
 pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
 pyo3-log = "0.12.4"
 arrow = { version = "55.1.0", features = ["pyarrow"] }
-datafusion = { version = "49.0.1", features = ["avro", "unicode_expressions"] }
-datafusion-substrait = { version = "49.0.1", optional = true }
-datafusion-proto = { version = "49.0.1" }
-datafusion-ffi = { version = "49.0.1" }
+datafusion = { version = "49.0.2", features = ["avro", "unicode_expressions"] }
+datafusion-substrait = { version = "49.0.2", optional = true }
+datafusion-proto = { version = "49.0.2" }
+datafusion-ffi = { version = "49.0.2" }
 prost = "0.13.1" # keep in line with `datafusion-substrait`
 uuid = { version = "1.18", features = ["v4"] }
 mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
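
Editorial note, not part of the commit: this revert leaves the Python package version at 49.0.0 while the underlying DataFusion crates are pinned at 49.0.2. A minimal sketch for checking which binding version is installed in a given environment, assuming the package exposes `__version__` as current releases do:

    import datafusion

    # Prints the installed Python binding version (e.g. "49.0.0"); the Rust
    # crate versions pinned above are a separate, internal detail.
    print(datafusion.__version__)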

README.md

Lines changed: 8 additions & 0 deletions

@@ -42,6 +42,10 @@ DataFusion's Python bindings can be used as a foundation for building new data s
 - Serialize and deserialize query plans in Substrait format.
 - Experimental support for transpiling SQL queries to DataFrame calls with Polars, Pandas, and cuDF.

+For tips on tuning parallelism, see
+[Maximizing CPU Usage](docs/source/user-guide/configuration.rst#maximizing-cpu-usage)
+in the configuration guide.
+
 ## Example Usage

 The following example demonstrates running a SQL query against a Parquet file using DataFusion, storing the results

@@ -227,6 +231,8 @@ and for `uv run` commands the additional parameter `--no-project`
 ```bash
 # fetch this repo
 git clone git@github.com:apache/datafusion-python.git
+# cd to the repo root
+cd datafusion-python/
 # create the virtual enviornment
 uv sync --dev --no-install-package datafusion
 # activate the environment

@@ -238,6 +244,8 @@ Bootstrap (`pip`):
 ```bash
 # fetch this repo
 git clone git@github.com:apache/datafusion-python.git
+# cd to the repo root
+cd datafusion-python/
 # prepare development environment (used to build wheel / install in development)
 python3 -m venv .venv
 # activate the venv
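
Editorial aside, not part of this commit: once either bootstrap path above has built and installed the bindings into the active environment, a quick smoke test is to run a trivial query. A minimal sketch, assuming the build and install succeeded:

    from datafusion import SessionContext

    # Trivial query to confirm the freshly built bindings import and run.
    ctx = SessionContext()
    ctx.sql("SELECT 1 AS one").show()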

benchmarks/max_cpu_usage.py

Lines changed: 107 additions & 0 deletions

@@ -0,0 +1,107 @@ (new file)

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Benchmark script showing how to maximize CPU usage.

This script demonstrates one example of tuning DataFusion for improved parallelism
and CPU utilization. It uses synthetic in-memory data and performs simple aggregation
operations to showcase the impact of partitioning configuration.

IMPORTANT: This is a simplified example designed to illustrate partitioning concepts.
Actual performance in your applications may vary significantly based on many factors:

- Type of table providers (Parquet files, CSV, databases, etc.)
- I/O operations and storage characteristics (local disk, network, cloud storage)
- Query complexity and operation types (joins, window functions, complex expressions)
- Data distribution and size characteristics
- Memory available and hardware specifications
- Network latency for distributed data sources

It is strongly recommended that you create similar benchmarks tailored to your specific:
- Hardware configuration
- Data sources and formats
- Typical query patterns and workloads
- Performance requirements

This will give you more accurate insights into how DataFusion configuration options
will affect your particular use case.
"""

from __future__ import annotations

import argparse
import multiprocessing
import time

import pyarrow as pa
from datafusion import SessionConfig, SessionContext, col
from datafusion import functions as f


def main(num_rows: int, partitions: int) -> None:
    """Run a simple aggregation after repartitioning.

    This function demonstrates basic partitioning concepts using synthetic data.
    Real-world performance will depend on your specific data sources, query types,
    and system configuration.
    """
    # Create some example data (synthetic in-memory data for demonstration)
    # Note: Real applications typically work with files, databases, or other
    # data sources that have different I/O and distribution characteristics
    array = pa.array(range(num_rows))
    batch = pa.record_batch([array], names=["a"])

    # Configure the session to use a higher target partition count and
    # enable automatic repartitioning.
    config = (
        SessionConfig()
        .with_target_partitions(partitions)
        .with_repartition_joins(enabled=True)
        .with_repartition_aggregations(enabled=True)
        .with_repartition_windows(enabled=True)
    )
    ctx = SessionContext(config)

    # Register the input data and repartition manually to ensure that all
    # partitions are used.
    df = ctx.create_dataframe([[batch]]).repartition(partitions)

    start = time.time()
    df = df.aggregate([], [f.sum(col("a"))])
    df.collect()
    end = time.time()

    print(
        f"Processed {num_rows} rows using {partitions} partitions in {end - start:.3f}s"
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--rows",
        type=int,
        default=1_000_000,
        help="Number of rows in the generated dataset",
    )
    parser.add_argument(
        "--partitions",
        type=int,
        default=multiprocessing.cpu_count(),
        help="Target number of partitions to use",
    )
    args = parser.parse_args()
    main(args.rows, args.partitions)
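
Editorial aside, not part of the commit: besides timing the query, the effect of these settings can be inspected by printing the plan with DataFrame.explain(). A minimal sketch along the lines of the script above:

    import pyarrow as pa
    from datafusion import SessionConfig, SessionContext, col
    from datafusion import functions as f

    # Small synthetic input, same shape as in the script.
    batch = pa.record_batch([pa.array(range(1_000))], names=["a"])

    config = SessionConfig().with_target_partitions(4)
    ctx = SessionContext(config)

    # Build the same aggregation and print the logical and physical plans;
    # the physical plan shows how work is split across partitions.
    df = ctx.create_dataframe([[batch]]).repartition(4)
    df = df.aggregate([], [f.sum(col("a"))])
    df.explain()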

dev/changelog/49.0.0.md

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@ (new file)

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Apache DataFusion Python 49.0.0 Changelog

This release consists of 16 commits from 7 contributors. See credits at the end of this changelog for more information.

**Fixed bugs:**

- fix(build): Include build.rs in published crates [#1199](https://github.com/apache/datafusion-python/pull/1199) (colinmarc)

**Other:**

- 48.0.0 Release [#1175](https://github.com/apache/datafusion-python/pull/1175) (timsaucer)
- Update CI rules [#1188](https://github.com/apache/datafusion-python/pull/1188) (timsaucer)
- Fix Python UDAF Accumulator Interface example to Properly Handle State and Updates with List[Array] Types [#1192](https://github.com/apache/datafusion-python/pull/1192) (kosiew)
- chore: Upgrade datafusion to version 49 [#1200](https://github.com/apache/datafusion-python/pull/1200) (nuno-faria)
- Update how to dev instructions [#1179](https://github.com/apache/datafusion-python/pull/1179) (ntjohnson1)
- build(deps): bump object_store from 0.12.2 to 0.12.3 [#1189](https://github.com/apache/datafusion-python/pull/1189) (dependabot[bot])
- build(deps): bump uuid from 1.17.0 to 1.18.0 [#1202](https://github.com/apache/datafusion-python/pull/1202) (dependabot[bot])
- build(deps): bump async-trait from 0.1.88 to 0.1.89 [#1203](https://github.com/apache/datafusion-python/pull/1203) (dependabot[bot])
- build(deps): bump slab from 0.4.10 to 0.4.11 [#1205](https://github.com/apache/datafusion-python/pull/1205) (dependabot[bot])
- Improved window and aggregate function signature [#1187](https://github.com/apache/datafusion-python/pull/1187) (timsaucer)
- Optional improvements in verification instructions [#1183](https://github.com/apache/datafusion-python/pull/1183) (paleolimbot)
- Improve `show()` output for empty DataFrames [#1208](https://github.com/apache/datafusion-python/pull/1208) (kosiew)
- build(deps): bump actions/download-artifact from 4 to 5 [#1201](https://github.com/apache/datafusion-python/pull/1201) (dependabot[bot])
- build(deps): bump url from 2.5.4 to 2.5.7 [#1210](https://github.com/apache/datafusion-python/pull/1210) (dependabot[bot])
- build(deps): bump actions/checkout from 4 to 5 [#1204](https://github.com/apache/datafusion-python/pull/1204) (dependabot[bot])

## Credits

Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.

```
    7 dependabot[bot]
    3 Tim Saucer
    2 kosiew
    1 Colin Marc
    1 Dewey Dunnington
    1 Nick
    1 Nuno Faria
```

Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.

docs/source/user-guide/configuration.rst

Lines changed: 136 additions & 1 deletion

@@ -46,6 +46,141 @@ a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.conte
     ctx = SessionContext(config, runtime)
     print(ctx)

+Maximizing CPU Usage
+--------------------

-You can read more about available :py:class:`~datafusion.context.SessionConfig` options in the `rust DataFusion Configuration guide <https://arrow.apache.org/datafusion/user-guide/configs.html>`_,
+DataFusion uses partitions to parallelize work. For small queries the
+default configuration (number of CPU cores) is often sufficient, but to
+fully utilize available hardware you can tune how many partitions are
+created and when DataFusion will repartition data automatically.
+
+Configure a ``SessionContext`` with a higher partition count:
+
+.. code-block:: python
+
+    from datafusion import SessionConfig, SessionContext
+
+    # allow up to 16 concurrent partitions
+    config = SessionConfig().with_target_partitions(16)
+    ctx = SessionContext(config)
+
+Automatic repartitioning for joins, aggregations, window functions and
+other operations can be enabled to increase parallelism:
+
+.. code-block:: python
+
+    config = (
+        SessionConfig()
+        .with_target_partitions(16)
+        .with_repartition_joins(True)
+        .with_repartition_aggregations(True)
+        .with_repartition_windows(True)
+    )
+
+Manual repartitioning is available on DataFrames when you need precise
+control:
+
+.. code-block:: python
+
+    from datafusion import col
+
+    df = ctx.read_parquet("data.parquet")
+
+    # Evenly divide into 16 partitions
+    df = df.repartition(16)
+
+    # Or partition by the hash of a column
+    df = df.repartition_by_hash(col("a"), num=16)
+
+    result = df.collect()
+
+
+Benchmark Example
+^^^^^^^^^^^^^^^^^
+
+The repository includes a benchmark script that demonstrates how to maximize CPU usage
+with DataFusion. The :code:`benchmarks/max_cpu_usage.py` script shows a practical example
+of configuring DataFusion for optimal parallelism.
+
+You can run the benchmark script to see the impact of different configuration settings:
+
+.. code-block:: bash
+
+    # Run with default settings (uses all CPU cores)
+    python benchmarks/max_cpu_usage.py
+
+    # Run with specific number of rows and partitions
+    python benchmarks/max_cpu_usage.py --rows 5000000 --partitions 16
+
+    # See all available options
+    python benchmarks/max_cpu_usage.py --help
+
+Here's an example showing the performance difference between single and multiple partitions:
+
+.. code-block:: bash
+
+    # Single partition - slower processing
+    $ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 1
+    Processed 10000000 rows using 1 partitions in 0.107s
+
+    # Multiple partitions - faster processing
+    $ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 10
+    Processed 10000000 rows using 10 partitions in 0.038s
+
+This example demonstrates nearly 3x performance improvement (0.107s vs 0.038s) when using
+10 partitions instead of 1, showcasing how proper partitioning can significantly improve
+CPU utilization and query performance.
+
+The script demonstrates several key optimization techniques:
+
+1. **Higher target partition count**: Uses :code:`with_target_partitions()` to set the number of concurrent partitions
+2. **Automatic repartitioning**: Enables repartitioning for joins, aggregations, and window functions
+3. **Manual repartitioning**: Uses :code:`repartition()` to ensure all partitions are utilized
+4. **CPU-intensive operations**: Performs aggregations that can benefit from parallelization
+
+The benchmark creates synthetic data and measures the time taken to perform a sum aggregation
+across the specified number of partitions. This helps you understand how partition configuration
+affects performance on your specific hardware.
+
+Important Considerations
+""""""""""""""""""""""""
+
+The provided benchmark script demonstrates partitioning concepts using synthetic in-memory data
+and simple aggregation operations. While useful for understanding basic configuration principles,
+actual performance in production environments may vary significantly based on numerous factors:
+
+**Data Sources and I/O Characteristics:**
+
+- **Table providers**: Performance differs greatly between Parquet files, CSV files, databases, and cloud storage
+- **Storage type**: Local SSD, network-attached storage, and cloud storage have vastly different characteristics
+- **Network latency**: Remote data sources introduce additional latency considerations
+- **File sizes and distribution**: Large files may benefit differently from partitioning than many small files
+
+**Query and Workload Characteristics:**
+
+- **Operation complexity**: Simple aggregations versus complex joins, window functions, or nested queries
+- **Data distribution**: Skewed data may not partition evenly, affecting parallel efficiency
+- **Memory usage**: Large datasets may require different memory management strategies
+- **Concurrent workloads**: Multiple queries running simultaneously affect resource allocation
+
+**Hardware and Environment Factors:**
+
+- **CPU architecture**: Different processors have varying parallel processing capabilities
+- **Available memory**: Limited RAM may require different optimization strategies
+- **System load**: Other applications competing for resources affect DataFusion performance
+
+**Recommendations for Production Use:**
+
+To optimize DataFusion for your specific use case, it is strongly recommended to:
+
+1. **Create custom benchmarks** using your actual data sources, formats, and query patterns
+2. **Test with representative data volumes** that match your production workloads
+3. **Measure end-to-end performance** including data loading, processing, and result handling
+4. **Evaluate different configuration combinations** for your specific hardware and workload
+5. **Monitor resource utilization** (CPU, memory, I/O) to identify bottlenecks in your environment
+
+This approach will provide more accurate insights into how DataFusion configuration options
+will impact your particular applications and infrastructure.
+
+For more information about available :py:class:`~datafusion.context.SessionConfig` options, see the `rust DataFusion Configuration guide <https://arrow.apache.org/datafusion/user-guide/configs.html>`_,
 and about :code:`RuntimeEnvBuilder` options in the rust `online API documentation <https://docs.rs/datafusion/latest/datafusion/execution/runtime_env/struct.RuntimeEnvBuilder.html>`_.
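
Editorial note, not part of the commit: the "Recommendations for Production Use" list added above suggests building custom benchmarks against your own data. A minimal sketch of what that could look like, where the Parquet path and column names (data/events.parquet, user_id, event_id) are purely placeholders:

    import time

    from datafusion import SessionConfig, SessionContext, col
    from datafusion import functions as f

    PARQUET_PATH = "data/events.parquet"  # placeholder: point at your own data

    for partitions in (1, 4, 8, 16):
        config = (
            SessionConfig()
            .with_target_partitions(partitions)
            .with_repartition_aggregations(True)
        )
        ctx = SessionContext(config)

        # A representative query: group-by count over placeholder columns.
        df = ctx.read_parquet(PARQUET_PATH).aggregate(
            [col("user_id")], [f.count(col("event_id"))]
        )

        start = time.time()
        df.collect()
        print(f"{partitions:>2} partitions: {time.time() - start:.3f}s")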

docs/source/user-guide/dataframe/index.rst

Lines changed: 1 addition & 22 deletions

@@ -145,31 +145,10 @@ To materialize the results of your DataFrame operations:

     # Display results
     df.show()  # Print tabular format to console
-
+
     # Count rows
     count = df.count()

-PyArrow Streaming
------------------
-
-DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling
-zero-copy streaming into libraries like `PyArrow <https://arrow.apache.org/>`_.
-Earlier versions eagerly converted the entire DataFrame when exporting to
-PyArrow, which could exhaust memory on large datasets. With streaming, batches
-are produced lazily so you can process arbitrarily large results without
-out-of-memory errors.
-
-.. code-block:: python
-
-    import pyarrow as pa
-
-    # Create a PyArrow RecordBatchReader without materializing all batches
-    reader = pa.RecordBatchReader._import_from_c(df.__arrow_c_stream__())
-    for batch in reader:
-        ...  # process each batch as it is produced
-
-See :doc:`../io/arrow` for additional details on the Arrow interface.
-
 HTML Rendering
 --------------
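Editorial aside, not part of the diff: the removed example relied on the private pa.RecordBatchReader._import_from_c helper. If lazy streaming of a DataFrame into PyArrow is still needed, a hedged sketch using the public API, assuming pyarrow 15 or newer where RecordBatchReader.from_stream accepts any object implementing __arrow_c_stream__:

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT 1 AS a")

    # Batches are pulled lazily from the DataFrame's Arrow C stream rather
    # than being materialized up front.
    reader = pa.RecordBatchReader.from_stream(df)
    for batch in reader:
        ...  # process each batch as it is produced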
