Also estimate the last_level_cache size for Vulkan devices

antonysigma · antonysigma · commit 56bff19aaff0 · 2025-07-17T10:54:13.000-07:00
diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
@@ -24,7 +24,7 @@ if(Halide_TARGET MATCHES "cuda|metal")
     list(APPEND _camera_pipe_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(Halide_TARGET MATCHES "opencl")
+elseif(Halide_TARGET MATCHES "opencl|vulkan")
     # Set last_level_cache per GPU block to an extremely small value. This
     # eliminates all `.compute_at` in the generated schedules, which in turn
     # eliminates all GPU shared memory allocations.
diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
@@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(harris.generator SOURCES harris_generator.cpp)
 
+set(_harris_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "opencl|metal")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _harris_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_harris_autoscheduler_params})
 
 # Main executable
 add_executable(harris_filter filter.cpp)
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
@@ -24,12 +24,12 @@ if(Halide_TARGET MATCHES "cuda")
     list(APPEND _local_laplacian_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(Halide_TARGET MATCHES "metal|opencl")
-    # Set last_level_cache per GPU block to an extremely small value. This
-    # eliminates all `.compute_at` in the generated schedules, which in turn
-    # eliminates all GPU shared memory allocations.
+elseif(Halide_TARGET MATCHES "metal|opencl|vulkan")
+    # The pipeline is bound by GPU shared memory. Limit the parallelism to
+    # a minimal value (=32) to cap the GPU shared memory size.
     list(APPEND _local_laplacian_autoscheduler_params
         autoscheduler.last_level_cache_size=1000
+        autoscheduler.parallelism=32
     )
 endif()
 

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ if(Halide_TARGET MATCHES "cuda\|metal")`
`24`	`24`	`list(APPEND _camera_pipe_autoscheduler_params`
`25`	`25`	`autoscheduler.last_level_cache_size=10000`
`26`	`26`	`)`
`27`		`-elseif(Halide_TARGET MATCHES "opencl")`
	`27`	`+elseif(Halide_TARGET MATCHES "opencl\|vulkan")`
`28`	`28`	`# Set last_level_cache per GPU block to an extremely small value. This`
`29`	`29`	`` # eliminates all `.compute_at` in the generated schedules, which in turn ``
`30`	`30`	`# eliminates all GPU shared memory allocations.`