Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ CMakeUserPresets.json
.cline_storage
config.mk
.claude/
.venv/
1 change: 1 addition & 0 deletions Applications/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ add_subdirectory(convolution)
add_subdirectory(floyd_warshall)
add_subdirectory(histogram)
add_subdirectory(prefix_sum)
add_subdirectory(optical_flow)

if(NOT CXX_FS_HEADER_FOUND)
message(WARNING "filesystem not found, not building fdtd example")
Expand Down
2 changes: 2 additions & 0 deletions Applications/optical_flow/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
applications_optical_flow
*.flo
53 changes: 53 additions & 0 deletions Applications/optical_flow/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# MIT License
#
# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

set(example_name applications_optical_flow)

cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(${example_name} LANGUAGES CXX)

include("${CMAKE_CURRENT_LIST_DIR}/../../Common/HipPlatform.cmake")
select_gpu_language()

enable_language(${ROCM_EXAMPLES_GPU_LANGUAGE})
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_${ROCM_EXAMPLES_GPU_LANGUAGE}_STANDARD 17)
set(CMAKE_${ROCM_EXAMPLES_GPU_LANGUAGE}_EXTENSIONS OFF)
set(CMAKE_${ROCM_EXAMPLES_GPU_LANGUAGE}_STANDARD_REQUIRED ON)
select_hip_platform()

include("${CMAKE_CURRENT_LIST_DIR}/../../Common/ROCmPath.cmake")

add_executable(${example_name} main.hip flowHIP.hip flowGold.cpp)
add_test(NAME ${example_name} COMMAND ${example_name})

set(include_dirs "../../Common")
if(ROCM_EXAMPLES_HIP_PLATFORM STREQUAL "nvidia")
list(APPEND include_dirs "${ROCM_PATH}/include")
endif()

target_include_directories(${example_name} PRIVATE ${include_dirs})
set_source_files_properties(main.hip flowHIP.hip PROPERTIES LANGUAGE ${ROCM_EXAMPLES_GPU_LANGUAGE})

install(TARGETS ${example_name})
61 changes: 61 additions & 0 deletions Applications/optical_flow/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# MIT License
#
# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

EXAMPLE := applications_optical_flow
COMMON_INCLUDE_DIR := ../../Common
GPU_RUNTIME ?= HIP

ROCM_PATH ?= /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_PATH)/include

HIPCXX ?= $(ROCM_PATH)/bin/hipcc

CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=

ifeq ($(GPU_RUNTIME), CUDA)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif

ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)

$(EXAMPLE): main.hip flowHIP.hip flowGold.cpp $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ main.hip flowHIP.hip flowGold.cpp $(ILDLIBS)

test: $(EXAMPLE)
./$(EXAMPLE) $(TEST_ARGS)

clean:
$(RM) $(EXAMPLE)

.PHONY: clean test
103 changes: 103 additions & 0 deletions Applications/optical_flow/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Optical Flow

## Description

This example implements the **Horn-Schunck variational optical flow** algorithm using HIP. It estimates the motion field (displacement vectors) between two consecutive image frames by minimizing a global energy functional that balances data fidelity and spatial smoothness.

The algorithm operates on a Gaussian image pyramid: flow is computed coarse-to-fine, with each level refining the estimate from the level above. At each pyramid level, image warping aligns the target frame with the source, and a Jacobi iterative solver computes the incremental flow update.

The program computes optical flow on both the CPU (`flowGold`) and GPU (`flowHIP`), compares the results via L1 norm, and writes two `.flo` files (Middlebury format) for inspection.

## Application Flow

1. Load two consecutive frames (`data/frame10.ppm`, `data/frame11.ppm`) as single-channel FP32 images.
2. Build a Gaussian pyramid with `nLevels` levels by repeatedly downscaling with a 4-tap filter.
3. At each pyramid level (coarse to fine):
- Upscale the flow estimate from the coarser level.
- Warp the target image toward the source using the current flow estimate.
- Compute image derivatives (Ix, Iy, Iz) via finite differences.
- Run `nSolverIters` Jacobi iterations to solve for the incremental flow update.
- Repeat for `nWarpIters` warping passes.
4. Copy GPU results to host and compare against the CPU reference (L1 norm per pixel).
5. Write `FlowGPU.flo` and `FlowCPU.flo` to disk.

## Key APIs and Concepts

| Concept | HIP API |
|---|---|
| Texture objects with bilinear filtering | `hipCreateTextureObject`, `hipTextureObject_t` |
| Pitched 2D texture resource | `hipResourceTypePitch2D`, `hipResourceDesc` |
| Mirror address mode | `hipAddressModeMirror` |
| Normalized texture coordinates | `texDescr.normalizedCoords = true` |
| In-kernel texture fetch | `tex2D<float>(tex, x, y)` |
| Block synchronization | `cg::this_thread_block()`, `cg::sync()` |

### Pitch Alignment Requirement

ROCm requires `pitchInBytes` for `hipResourceTypePitch2D` to be a multiple of **256 bytes** (64 floats × 4 bytes). The `StrideAlignment` constant in `common.h` is set to `64` to satisfy this constraint. CUDA only requires 128 bytes (32 floats), so porting code that used `StrideAlignment = 32` will fail at texture creation.

## Prerequisites

- A ROCm-capable AMD GPU
- ROCm SDK installed ([installation guide](https://rocm.docs.amd.com/en/latest/index.html) or [TheRock releases](https://github.com/ROCm/TheRock/blob/main/RELEASES.md))
- `hipcc` on your `PATH`

## Building

### Make

```bash
cd Applications/optical_flow
make
```

### CMake

```bash
cd Applications/optical_flow
cmake -B build -DROCM_PATH=<path-to-rocm-sdk> -DCMAKE_HIP_COMPILER=<path-to-hipcc>
cmake --build build -j$(nproc)
```

Replace `<path-to-rocm-sdk>` and `<path-to-hipcc>` with the actual paths for your ROCm installation, for example `/opt/rocm` and `/opt/rocm/bin/hipcc`.

## Running

The binary expects two PPM images at `data/frame10.ppm` and `data/frame11.ppm` relative to the working directory. Sample frames from the Middlebury dataset work well.

```bash
#!/bin/bash
# From the optical_flow directory (Make build)
./optical_flow

# Or if built with CMake
./build/applications_optical_flow
```

### Expected Output

```text
HSOpticalFlow Starting...

Using device: <GPU name>
Loading "data/frame10.ppm" ...
Loading "data/frame11.ppm" ...
L1 error : 0.000xxx
```

The program exits with `EXIT_SUCCESS` when the L1 error between the GPU and CPU results is below `0.05`. Two output files are written:

- `FlowGPU.flo` — GPU optical flow result
- `FlowCPU.flo` — CPU reference result

Both files use the [Middlebury `.flo` format](http://vision.middlebury.edu/flow/code/flow-code/README.txt) and can be visualized with tools such as `flowiz` or the Middlebury flow utilities.

## Key Notes

- StrideAlignment is 64 since `hipResourceTypePitch2D` requires `pitchInBytes` to be a multiple of `256 bytes` (4 bytes * 64 floats = 256 bytes)
- If you are running ROCm through Python packages and having trouble with compiling due to the program not being able to find dependencies, please run the following in your virtual environment:

```bash
#!/bin/bash
export ROCM_PATH=PATH_TO_VENV/.venv/lib/python3.12/site-packages/_rocm_sdk_devel
```
57 changes: 57 additions & 0 deletions Applications/optical_flow/addKernel.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// MIT License
//
// Copyright (c) 2023-2026 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "common.h"

///////////////////////////////////////////////////////////////////////////////
/// \brief add two vectors of size _count_
///
/// HIP kernel
/// \param[in] op1 term one
/// \param[in] op2 term two
/// \param[in] count vector size
/// \param[out] sum result
///////////////////////////////////////////////////////////////////////////////
__global__ void AddKernel(const float *op1, const float *op2, int count, float *sum)
{
const int pos = threadIdx.x + blockIdx.x * blockDim.x;

if (pos >= count)
return;

sum[pos] = op1[pos] + op2[pos];
}

///////////////////////////////////////////////////////////////////////////////
/// \brief add two vectors of size _count_
/// \param[in] op1 term one
/// \param[in] op2 term two
/// \param[in] count vector size
/// \param[out] sum result
///////////////////////////////////////////////////////////////////////////////
static void Add(const float *op1, const float *op2, int count, float *sum)
{
dim3 threads(256);
dim3 blocks(iDivUp(count, threads.x));

AddKernel<<<blocks, threads>>>(op1, op2, count, sum);
}
50 changes: 50 additions & 0 deletions Applications/optical_flow/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// MIT License
//
// Copyright (c) 2023-2026 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#pragma once

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// HIP hipResourceTypePitch2D requires pitchInBytes to be a multiple of 256 bytes.
// 64 floats * 4 bytes = 256 bytes satisfies this
constexpr int STRIDE_ALIGNMENT = 64;

inline int iAlignUp(int n, int m = STRIDE_ALIGNMENT)
{
int mod = n % m;
if (mod)
return n + m - mod;
else
return n;
}

inline int iDivUp(int n, int m) { return (n + m - 1) / m; }

template <typename T> inline void Swap(T &a, T &b)
{
T t = a;
a = b;
b = t;
}
Loading
Loading