KernelTuner
diff --git a/‎doc/source/contents.rst‎
Lines changed: 1 addition & 0 deletions b/‎doc/source/contents.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/source/launch_ray.sh‎
Lines changed: 34 additions & 0 deletions b/‎doc/source/launch_ray.sh‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎doc/source/parallel.rst‎
Lines changed: 150 additions & 0 deletions b/‎doc/source/parallel.rst‎
Lines changed: 150 additions & 0 deletions
diff --git a/‎doc/source/parallel_runner.png‎
195 KB b/‎doc/source/parallel_runner.png‎
195 KB
diff --git a/‎doc/source/submit_ray.sh‎
Lines changed: 26 additions & 0 deletions b/‎doc/source/submit_ray.sh‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎examples/cuda/sepconv_parallel.py‎
Lines changed: 88 additions & 0 deletions b/‎examples/cuda/sepconv_parallel.py‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎examples/cuda/vector_add_parallel.py‎
Lines changed: 45 additions & 0 deletions b/‎examples/cuda/vector_add_parallel.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎kernel_tuner/backends/cupy.py‎
Lines changed: 6 additions & 0 deletions b/‎kernel_tuner/backends/cupy.py‎
Lines changed: 6 additions & 0 deletions
@@ -36,6 +36,7 @@ The Kernel Tuner documentation
    optimization
    metrics
    observers
+   parallel
 
 .. toctree::
    :maxdepth: 1
 
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Get SLURM variables
+NODELIST="${SLURM_STEP_NODELIST:-${SLURM_JOB_NODELIST:-}}"
+NUM_NODES="${SLURM_STEP_NUM_NODES:-${SLURM_JOB_NUM_NODES:-}}"
+
+if [[ -z "$NODELIST" || -z "$NUM_NODES" ]]; then
+  echo "ERROR: Not running under Slurm (missing SLURM_* vars)."
+  exit 1
+fi
+
+# Get head node
+NODES=$(scontrol show hostnames "$NODELIST")
+NODES_ARRAY=($NODES)
+RAY_IP="${NODES_ARRAY[0]}"
+RAY_PORT="${RAY_PORT:-6379}"
+RAY_ADDRESS="${RAY_IP}:${RAY_PORT}"
+
+# Ensure command exists (Ray >= 2.49 per docs)
+if ! ray symmetric-run --help >/dev/null 2>&1; then
+  echo "ERROR: 'ray symmetric-run' not available. Check Ray installation (needs Ray 2.49+)."
+  exit 1
+fi
+
+# Launch cluster!
+echo "Ray head node: $RAY_ADDRESS"
+
+exec ray symmetric-run \
+  --address "$RAY_ADDRESS" \
+  --min-nodes "$NUM_NODES" \
+  -- \
+  "$@"
+
@@ -0,0 +1,150 @@
+Parallel and Remote Tuning
+==========================
+
+By default, Kernel Tuner benchmarks GPU kernel configurations sequentially on a single local GPU.
+While this works well for small tuning problems, it can become a bottleneck for larger search spaces.
+
+.. image:: parallel_runner.png
+  :width: 700px
+  :alt: Example of sequential versus parallel tuning.
+
+
+Kernel Tuner also supports **parallel tuning**, allowing multiple GPUs to evaluate kernel configurations in parallel.
+The same mechanism can be used for **remote tuning**, where Kernel Tuner runs on a host system while one or more GPUs are located on remote machines.
+
+Parallel/remote tuning is implemented using `Ray <https://docs.ray.io/en/latest/>`_ and works on both local multi-GPU systems and distributed clusters.
+
+How to use
+----------
+
+To enable parallel tuning, pass the ``parallel`` argument to ``tune_kernel``:
+
+.. code-block:: python
+
+    kernel_tuner.tune_kernel(
+        "vector_add",
+        kernel_string,
+        size,
+        args,
+        tune_params,
+        parallel=True,
+    )
+
+If ``parallel`` is set to ``True``, Kernel Tuner will use all available Ray workers for tuning.
+The ``parallel`` option can also be set to an integer ``n`` to use exactly ``n`` workers.
+
+Alternatively, define the environment variable ``KERNEL_TUNER_PARALLEL`` to enable parallel execution without modifying your Python code.
+
+.. code-block:: bash
+
+   $ KERNEL_TUNER_PARALLEL=true python3 my_tuning_script.py
+
+
+
+Parallel tuning and optimization strategies
+-------------------------------------------
+
+The achievable speedup from using multiple GPUs depends in part on the **optimization strategy** used during tuning.
+
+Some optimization strategies support **maximum parallelism** and can evaluate all configurations independently.
+Other strategies support **limited parallelism**, typically by repeatedly evaluating a fixed-size population of configurations in parallel.
+Finally, some strategies are **inherently sequential** and always evaluate configurations one by one, providing no parallelism.
+
+The current optimization strategies can be grouped as follows:
+
+* **Maximum parallelism**:
+  ``brute_force``, ``random_sample``
+
+* **Limited parallelism**:
+  ``genetic_algorithm``, ``pso``, ``diff_evo``, ``firefly_algorithm``
+
+* **No parallelism**:
+  ``minimize``, ``basinhopping``, ``greedy_mls``, ``ordered_greedy_mls``,
+  ``greedy_ils``, ``dual_annealing``, ``mls``,
+  ``simulated_annealing``, ``bayes_opt``
+
+
+
+Setting up Ray
+--------------
+
+Kernel Tuner uses `Ray <https://docs.ray.io/en/latest/>`_ to distribute kernel evaluations across multiple GPUs.
+Ray is an open-source framework for distributed computing in Python.
+
+To use parallel tuning, you must first install Ray itself:
+
+.. code-block:: bash
+
+   $ pip install ray
+
+Next, you must set up a Ray cluster.
+Kernel Tuner will internally attempt to connect to an existing cluster by calling:
+
+.. code-block:: python
+
+   ray.init(address="auto")
+
+Refer to the Ray documentation for details on how ``ray.init()`` connects to a local or remote cluster
+(`documentation <https://docs.ray.io/en/latest/ray-core/api/doc/ray.init.html>`_).
+For example, you can set the ``RAY_ADDRESS`` environment variable to point to the address of a remote Ray head node.
+Alternatively, you may manually call ``ray.init(address="your_head_node_ip:6379")`` before calling ``tune_kernel``.
+
+Here are some common ways to set up your cluster:
+
+
+Local multi-GPU machine
+***********************
+
+By default, on a machine with multiple GPUs, Ray will start a temporary local cluster and automatically detect all available GPUs.
+Kernel Tuner can then use these GPUs in parallel for tuning.
+
+
+Distributed cluster with SLURM (easy, Ray ≥2.49)
+************************************************
+
+The most straightforward way to use Ray on a SLURM cluster is to use the ``ray symmetric-run`` command, available from Ray **2.49** onwards.
+This launches a Ray environment, runs your script, and then shuts it down again.
+
+Consider the following script ``launch_ray.sh``.
+
+.. literalinclude:: launch_ray.sh
+   :language: bash
+
+Next, run your Kernel Tuner script using ``srun``.
+The exact command depends on your cluster.
+In the example below, ``-N4`` indicates 4 nodes and ``--gres=gpu:1`` indicates 1 GPU per node.
+
+.. code-block:: bash
+
+   $ srun -N4 --gres=gpu:1 launch_ray.sh python3 my_tuning_script.py
+
+
+Distributed cluster with SLURM (manual, Ray <2.49)
+**************************************************
+
+An alternative way to use Ray on SLURM is to launch a Ray cluster, obtain the IP address of the head node, and then connect to it remotely.
+
+Consider the following sbatch script ``submit_ray.sh``.
+
+.. literalinclude:: submit_ray.sh
+   :language: bash
+
+Next, submit your job using ``sbatch``.
+
+.. code-block:: bash
+
+   $ sbatch submit_ray.sh
+   Submitted batch job 1223577
+
+After this, inspect the file ``slurm-1223577.out`` and search for the following line:
+
+.. code-block::
+
+   $ grep RAY_ADDRESS slurm-1223577.out
+   Launching head node: RAY_ADDRESS=145.184.221.164:6379
+
+Finally, launch your application using:
+
+.. code-block:: bash
+
+   $ RAY_ADDRESS=145.184.221.164:6379 python my_tuning_script.py
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --time=00:10:00
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-task=1
+# sbatch script that starts a Ray cluster inside the job allocation: one head
+# node plus one worker on every remaining node. The head address is printed so
+# that a client outside the job can connect to it through RAY_ADDRESS.
+set -euo pipefail
+
+# The first host in the job's node list becomes the Ray head node.
+HEAD_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
+# Use the first address reported by `hostname -I` on the head node.
+HEAD_NODE_IP=$(srun -N1 -n1 -w "$HEAD_NODE" bash -lc 'hostname -I | awk "{print \$1}"')
+RAY_PORT=6379
+RAY_ADDRESS="${HEAD_NODE_IP}:${RAY_PORT}"
+
+# This line is what users grep for in the slurm-<jobid>.out file.
+echo "Launching head node: RAY_ADDRESS=$RAY_ADDRESS"
+# --block keeps `ray start` in the foreground of its srun step; the step itself
+# is backgrounded so this script can continue and launch the workers.
+srun --nodes=1 --ntasks=1 -w "$HEAD_NODE" \
+  ray start --head --node-ip-address="$HEAD_NODE_IP" --port="$RAY_PORT" --block &
+# Presumably gives the head node time to come up before workers connect.
+sleep 5
+
+# Every node except the head runs one worker.
+NUM_WORKERS=$((SLURM_JOB_NUM_NODES - 1))
+echo "Launching ${NUM_WORKERS} worker node(s)"
+if [[ "$NUM_WORKERS" -gt 0 ]]; then
+  srun -n "$NUM_WORKERS" --nodes="$NUM_WORKERS" --ntasks-per-node=1 --exclude "$HEAD_NODE" \
+    ray start --address "$RAY_ADDRESS" --block &
+fi
+
+# Keep job alive (or replace with running your workload on the head)
+# `wait` blocks until the backgrounded srun steps above exit.
+wait
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+import numpy
+from kernel_tuner import tune_kernel
+from collections import OrderedDict
+
+
+def tune():
+    with open("convolution.cu", "r") as f:
+        kernel_string = f.read()
+
+    # setup tunable parameters
+    tune_params = OrderedDict()
+    tune_params["filter_height"] = [i for i in range(3, 19, 2)]
+    tune_params["filter_width"] = [i for i in range(3, 19, 2)]
+    tune_params["block_size_x"] = [16 * i for i in range(1, 65)]
+    tune_params["block_size_y"] = [2**i for i in range(6)]
+    tune_params["tile_size_x"] = [i for i in range(1, 11)]
+    tune_params["tile_size_y"] = [i for i in range(1, 11)]
+
+    tune_params["use_padding"] = [0, 1]  # toggle the insertion of padding in shared memory
+    tune_params["read_only"] = [0, 1]  # toggle using the read-only cache
+
+    # limit the search to only use padding when its effective, and at least 32 threads in a block
+    restrict = ["use_padding==0 or (block_size_x % 32 != 0)", "block_size_x*block_size_y >= 32"]
+
+    # setup input and output dimensions
+    problem_size = (4096, 4096)
+    size = numpy.prod(problem_size)
+    largest_fh = max(tune_params["filter_height"])
+    largest_fw = max(tune_params["filter_width"])
+    input_size = (problem_size[0] + largest_fw - 1) * (problem_size[1] + largest_fh - 1)
+
+    # create input data
+    output_image = numpy.zeros(size).astype(numpy.float32)
+    input_image = numpy.random.randn(input_size).astype(numpy.float32)
+    filter_weights = numpy.random.randn(largest_fh * largest_fw).astype(numpy.float32)
+
+    # setup kernel arguments
+    cmem_args = {"d_filter": filter_weights}
+    args = [output_image, input_image, filter_weights]
+
+    # tell the Kernel Tuner how to compute grid dimensions
+    grid_div_x = ["block_size_x", "tile_size_x"]
+    grid_div_y = ["block_size_y", "tile_size_y"]
+
+    # start tuning separable convolution (row)
+    tune_params["filter_height"] = [1]
+    tune_params["tile_size_y"] = [1]
+    results_row = tune_kernel(
+        "convolution_kernel",
+        kernel_string,
+        problem_size,
+        args,
+        tune_params,
+        grid_div_y=grid_div_y,
+        grid_div_x=grid_div_x,
+        cmem_args=cmem_args,
+        verbose=False,
+        restrictions=restrict,
+        parallel=True,
+        cache="convolution_kernel_row",
+    )
+
+    # start tuning separable convolution (col)
+    tune_params["filter_height"] = tune_params["filter_width"][:]
+    tune_params["file_size_y"] = tune_params["tile_size_x"][:]
+    tune_params["filter_width"] = [1]
+    tune_params["tile_size_x"] = [1]
+    results_col = tune_kernel(
+        "convolution_kernel",
+        kernel_string,
+        problem_size,
+        args,
+        tune_params,
+        grid_div_y=grid_div_y,
+        grid_div_x=grid_div_x,
+        cmem_args=cmem_args,
+        verbose=False,
+        restrictions=restrict,
+        parallel_runner=1024,
+        cache="convolution_kernel_col",
+    )
+
+    return results_row, results_col
+
+
+# Run both tuning passes when this file is executed as a script.
+if __name__ == "__main__":
+    results_row, results_col = tune()
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import numpy
+from kernel_tuner import tune_kernel
+from pprint import pprint
+
+
+def tune():
+    kernel_string = """
+    __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int base = ((blockIdx.x * block_size_x) + threadIdx.x) * elements_per_thread;
+
+        #pragma unroll unroll_factor
+        for (int offset = 0; offset < elements_per_thread; offset++) {
+            int i = base + offset;
+
+            if ( i < n ) {
+                c[i] = a[i] + b[i];
+            }
+        }
+    }
+    """
+
+    size = 10000000
+
+    a = numpy.random.randn(size).astype(numpy.float32)
+    b = numpy.random.randn(size).astype(numpy.float32)
+    c = numpy.zeros_like(b)
+    n = numpy.int32(size)
+
+    args = [c, a, b, n]
+
+    tune_params = dict()
+    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
+    tune_params["elements_per_thread"] = [1, 2, 3, 4, 5, 6, 7, 8]
+    tune_params["unroll_factor"] = [1, 2, 3, 4, 5, 6, 7, 8]
+
+    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, parallel=True)
+
+    pprint(env)
+    return results
+
+
+# Run the tuning when this file is executed as a script.
+if __name__ == "__main__":
+    tune()
@@ -2,6 +2,7 @@
 from __future__ import print_function
 
 import numpy as np
+import uuid
 
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.cupy import CupyRuntimeObserver
@@ -74,12 +75,17 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info
         }
         env["device_name"] = info_dict[f"Device {device} Name"]
+        env["pci_bus_id"] = info_dict[f"Device {device} PCI Bus ID"]
 
         env["cuda_version"] = cp.cuda.runtime.driverGetVersion()
         env["compute_capability"] = self.cc
         env["iterations"] = self.iterations
         env["compiler_options"] = compiler_options
         env["device_properties"] = self.devprops
+
+        props = cp.cuda.runtime.getDeviceProperties(device)
+        env["uuid"] = str(uuid.UUID(bytes=props["uuid"]))
+
         self.env = env
         self.name = env["device_name"]