Skip to content

Commit 252b248

Browse files
committed
feature: Add validation layer Timing Checker for host-side API timing
Add a new validation-layer checker (ZEL_ENABLE_TIMING_CHECKER) that measures the host-side (CPU) duration of every Level Zero API call and aggregates per-API statistics (call count, total, min, max, average in nanoseconds, and percentage share of total host time). For each API the checker stamps a high-resolution monotonic timestamp in the Prologue and reads it again in the Epilogue (QueryPerformanceCounter on Windows, CLOCK_MONOTONIC_RAW elsewhere). All APIs are covered through per-API override headers that are generated at build time from scripts/templates/validation/timing.h.mako (wired into scripts/generate_code.py). These generated headers are not checked in. Output is written directly to stderr and is independent of the loader logging system, so no ZEL_*_LOGGING variables are required. Three independently controlled modes: - summary table printed at teardown, sorted by percentage share (default) - ZEL_TIMING_CHECKER_CSV: export per-API stats to a PID-suffixed CSV - ZEL_TIMING_CHECKER_LIVE: print each call's duration as it happens Adds README documentation, a dedicated usage guide (checkers/timing/timing_checker.md), and a transparency unit test asserting the checker does not alter API results. Signed-off-by: Michal Mrozek <michal.mrozek@intel.com>
1 parent 75dc1fd commit 252b248

11 files changed

Lines changed: 694 additions & 0 deletions

File tree

scripts/generate_code.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def _mako_loader_cpp(path, namespace, tags, version, specs, meta):
182182
'handle_lifetime.h.mako' : ('handle_lifetime_tracking', 'handle_lifetime.h'),
183183
'handle_lifetime.cpp.mako' : ('handle_lifetime_tracking', 'handle_lifetime.cpp'),
184184
'certification.h.mako' : ('checkers/certification/generated', 'certification.h'),
185+
'timing.h.mako' : ('checkers/timing/generated', 'timing.h'),
185186
'to_string.h.mako' : ('../../utils', 'to_string.h'),
186187
}
187188

@@ -197,6 +198,10 @@ def _mako_validation_layer_cpp(path, namespace, tags, version, specs, meta):
197198

198199
filename = "%s_%s"%(namespace, validation_files[input_file][1])
199200
dstpath = os.path.join(dstpath, validation_files[input_file][0])
201+
# Ensure the output subdirectory exists. Generated checker outputs (e.g.
202+
# checkers/timing/generated) are not checked in, so the directory may be
203+
# absent in a clean tree.
204+
os.makedirs(dstpath, exist_ok=True)
200205
fout = os.path.join(dstpath, filename)
201206

202207
print("Generating %s..."%fout)
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<%!
2+
import re
3+
from templates import helper as th
4+
%><%
5+
n=namespace
6+
N=n.upper()
7+
%>/*
8+
* ***THIS FILE IS GENERATED. ***
9+
* See timing.h.mako for modifications
10+
*
11+
* Copyright (C) 2026 Intel Corporation
12+
*
13+
* SPDX-License-Identifier: MIT
14+
*
15+
* @file ${name}
16+
*
17+
*/
18+
#pragma once
19+
#include "../zel_global_timing_state.h"
20+
#include "${n}_entry_points.h"
21+
22+
namespace validation_layer {
23+
class ${N}timingCheckerGenerated : public ${N}ValidationEntryPoints {
24+
public:
25+
%for obj in th.extract_objs(specs, r"function"):
26+
<%
27+
param_lines = [line for line in th.make_param_lines(n, tags, obj, format=["type", "name", "delim"])]
28+
is_void_params = len(param_lines) == 1 and "void" in param_lines[0]
29+
func_name = th.make_func_name(n, tags, obj)
30+
%>virtual ze_result_t ${func_name}Prologue( \
31+
%for line in param_lines:
32+
${line} \
33+
%endfor
34+
) override {
35+
GlobalTimingState::getInstance().recordStart();
36+
return ZE_RESULT_SUCCESS;
37+
}
38+
virtual ze_result_t ${func_name}Epilogue( \
39+
%if not is_void_params:
40+
%for line in param_lines:
41+
${line} \
42+
%endfor
43+
, ${obj['return_type']} result) override {
44+
%else:
45+
${obj['return_type']} result ) override {
46+
%endif
47+
GlobalTimingState::getInstance().recordEnd("${func_name}");
48+
return ZE_RESULT_SUCCESS;
49+
}
50+
%endfor
51+
%if n == 'ze':
52+
// Experimental Intel extension for counter-based events
53+
virtual ze_result_t zexCounterBasedEventCreate2Prologue( ze_context_handle_t hContext, ze_device_handle_t hDevice, const void* desc, ze_event_handle_t* phEvent ) override {
54+
GlobalTimingState::getInstance().recordStart();
55+
return ZE_RESULT_SUCCESS;
56+
}
57+
virtual ze_result_t zexCounterBasedEventCreate2Epilogue( ze_context_handle_t hContext, ze_device_handle_t hDevice, const void* desc, ze_event_handle_t* phEvent , ze_result_t result) override {
58+
GlobalTimingState::getInstance().recordEnd("zexCounterBasedEventCreate2");
59+
return ZE_RESULT_SUCCESS;
60+
}
61+
%endif
62+
};
63+
} // namespace validation_layer

source/layers/validation/README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ By default, no validation modes will be enabled. The individual validation modes
2323
- `ZE_ENABLE_THREADING_VALIDATION` (Not yet Implemented)
2424
- `ZEL_ENABLE_CERTIFICATION_CHECKER`
2525
- `ZEL_ENABLE_SYSTEM_RESOURCE_TRACKER_CHECKER`
26+
- `ZEL_ENABLE_TIMING_CHECKER`
2627

2728
## Validation Modes
2829

@@ -141,6 +142,41 @@ export ZEL_LOADER_LOGGING_LEVEL=debug
141142

142143
See [System Resource Tracker documentation](checkers/system_resource_tracker/system_resource_tracker.md) for detailed usage and CSV format.
143144

145+
### `ZEL_ENABLE_TIMING_CHECKER`
146+
147+
The Timing Checker measures the host-side (CPU) duration of every Level Zero API
148+
call and aggregates per-API statistics (call count, total, min, max and average
149+
time in nanoseconds).
150+
151+
For each API call the checker stamps a high-resolution monotonic timestamp in the
152+
Prologue and reads it again in the Epilogue. The clock is
153+
`QueryPerformanceCounter` on Windows and `clock_gettime(CLOCK_MONOTONIC_RAW)`
154+
elsewhere. The measured span is dominated by the underlying driver call and is
155+
consistent across calls, making it suitable for relative host-cost analysis.
156+
157+
To enable, set:
158+
```bash
159+
export ZE_ENABLE_VALIDATION_LAYER=1
160+
export ZEL_ENABLE_TIMING_CHECKER=1
161+
```
162+
163+
Output modes (each independently controlled):
164+
165+
| Environment Variable | Default | Description |
166+
|---|---|---|
167+
| `ZEL_ENABLE_TIMING_CHECKER` | `0` | Enable the checker; a per-API summary table is printed at teardown |
168+
| `ZEL_TIMING_CHECKER_CSV` | (unset) | Also export the per-API statistics to a CSV file (the process id is appended to the filename) |
169+
| `ZEL_TIMING_CHECKER_LIVE` | `0` | Also print each call's duration as it happens (verbose) |
170+
171+
The summary and live output are written directly to `stderr` and the rows are
172+
sorted by their share of total host time (highest first). The Timing Checker does
173+
not use the loader logging system, so the `ZEL_ENABLE_LOADER_LOGGING` /
174+
`ZEL_LOADER_LOGGING_LEVEL` variables are **not** required (in particular you do
175+
not need to set logging to `trace`, which would also enable API-call logging).
176+
177+
See [Timing Checker documentation](checkers/timing/timing_checker.md) for detailed
178+
usage, output examples, and the CSV format.
179+
144180
## Testing
145181

146182
There is a small set of negative test cases designed to test the validation layer in the [level zero tests repo](https://github.com/oneapi-src/level-zero-tests/tree/master/negative_tests).

source/layers/validation/checkers/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ add_subdirectory(performance)
55
add_subdirectory(parameter_validation)
66
add_subdirectory(template)
77
add_subdirectory(system_resource_tracker)
8+
add_subdirectory(timing)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright (C) 2026 Intel Corporation
2+
# SPDX-License-Identifier: MIT
3+
4+
# The per-API override headers under generated/ are produced at build time from
5+
# scripts/templates/validation/timing.h.mako and are intentionally not listed
6+
# here (and not checked in); they are pulled in via #include by the sources below.
7+
target_sources(${TARGET_NAME}
8+
PRIVATE
9+
${CMAKE_CURRENT_LIST_DIR}/zel_timing_checker.h
10+
${CMAKE_CURRENT_LIST_DIR}/zel_timing_checker.cpp
11+
${CMAKE_CURRENT_LIST_DIR}/zel_global_timing_state.h
12+
)
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Timing Checker
2+
3+
The Timing Checker is a validation-layer mode that measures the host-side (CPU)
4+
duration of every Level Zero API call and reports per-API statistics. It is
5+
useful for spotting which API calls dominate host time in an application and for
6+
comparing host overhead across runs.
7+
8+
## What it measures
9+
10+
For every API the checker takes a high-resolution monotonic timestamp when the
11+
call enters the validation layer (Prologue) and again when it returns
12+
(Epilogue). The difference is the host-side duration of that call. Per API it
13+
accumulates:
14+
15+
- **Calls** - number of times the API was called
16+
- **Total** - sum of all host durations (ns)
17+
- **Min** / **Max** - fastest and slowest single call (ns)
18+
- **Avg** - Total / Calls (ns)
19+
20+
The clock is `QueryPerformanceCounter` on Windows and
21+
`clock_gettime(CLOCK_MONOTONIC_RAW)` on other platforms.
22+
23+
## How to enable
24+
25+
The Timing Checker runs inside the validation layer, so the validation layer must
26+
be enabled. Its output is written directly to `stderr`, so the loader logging
27+
variables (`ZEL_ENABLE_LOADER_LOGGING`, `ZEL_LOADER_LOGGING_LEVEL`,
28+
`ZEL_LOADER_LOG_CONSOLE`) are **not** required.
29+
30+
```bash
31+
export ZE_ENABLE_VALIDATION_LAYER=1 # turn the validation layer on
32+
export ZEL_ENABLE_TIMING_CHECKER=1 # enable the Timing Checker
33+
34+
./your_level_zero_application
35+
```
36+
37+
On Windows set the same variables with `set` instead of `export`.
38+
39+
## Output modes
40+
41+
The three modes are independent and can be combined. All output goes to `stderr`.
42+
43+
| Environment Variable | Default | Effect |
44+
|---|---|---|
45+
| `ZEL_ENABLE_TIMING_CHECKER` | `0` | Enable the checker; a per-API summary table is printed at teardown |
46+
| `ZEL_TIMING_CHECKER_CSV` | (unset) | Path to also export the per-API statistics as CSV |
47+
| `ZEL_TIMING_CHECKER_LIVE` | `0` | Also print every individual call's duration as it happens |
48+
49+
### 1. Summary table (default when enabled)
50+
51+
Printed to `stderr` at process teardown. Rows are sorted by `%`, each API's
52+
share of total host time, highest first:
53+
54+
```
55+
==== Level Zero Host API Timing (ns) ====
56+
Function Calls Total Min Max Avg %
57+
zeInitDrivers 2 65581869 666 65581203 32790934 95.84%
58+
zeCommandListCreateImmediate 1 2783378 2783378 2783378 2783378 4.07%
59+
zeCommandListAppendSignalEvent 1 1928777 1928777 1928777 1928777 2.82%
60+
zeDeviceGetMemoryProperties 2 51819 718 51101 25909 0.08%
61+
zeContextCreate 1 4219 4219 4219 4219 0.01%
62+
zeDeviceGet 2 682 198 484 341 0.00%
63+
```
64+
65+
### 2. CSV export (`ZEL_TIMING_CHECKER_CSV`)
66+
67+
```bash
68+
export ZEL_TIMING_CHECKER_CSV=timing.csv
69+
```
70+
71+
The process id is appended to the filename so concurrent processes do not
72+
overwrite each other (for example `timing_2449615.csv`). One row per API, sorted
73+
by `%` (share of total host time) descending. The `%` column is a percentage value:
74+
75+
```csv
76+
Function,Calls,TotalNs,MinNs,MaxNs,AvgNs,%
77+
zeInitDrivers,2,65581869,666,65581203,32790934,95.84
78+
zeCommandListCreateImmediate,1,2783378,2783378,2783378,2783378,4.07
79+
zeContextCreate,1,4219,4219,4219,4219,0.01
80+
```
81+
82+
### 3. Live per-call logging (`ZEL_TIMING_CHECKER_LIVE`)
83+
84+
```bash
85+
export ZEL_TIMING_CHECKER_LIVE=1
86+
```
87+
88+
Each call is printed to `stderr` as it completes (verbose):
89+
90+
```
91+
[timing] zeInitDrivers 66196062 ns
92+
[timing] zeDeviceGet 566 ns
93+
[timing] zeDeviceGetProperties 1301 ns
94+
```
95+
96+
## Interpreting results
97+
98+
- The measured span includes the underlying driver call plus the small overhead
99+
of any other validation handlers, so it is best used for relative comparison
100+
(which APIs cost the most, run-to-run deltas) rather than as an absolute
101+
driver-only timing.
102+
- First-call costs such as `zeInit` / `zeInitDrivers` are typically much larger
103+
than steady-state calls because they perform one-time driver initialization.
104+
105+
## Platform support
106+
107+
Linux and Windows. The checker has no external dependencies and works with any
108+
Level Zero driver, including the null driver used by the loader tests.
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
*
3+
* Copyright (C) 2026 Intel Corporation
4+
*
5+
* SPDX-License-Identifier: MIT
6+
*
7+
* @file zel_global_timing_state.h
8+
*
9+
*/
10+
11+
#pragma once
12+
13+
#include <cstdint>
14+
#include <map>
15+
#include <mutex>
16+
#include <string>
17+
18+
#include "ze_api.h"
19+
20+
namespace validation_layer {
21+
22+
// Per-API host timing aggregation. All values are in nanoseconds
23+
// except call_count.
24+
struct FunctionTime {
25+
uint64_t total_time = 0;
26+
uint64_t min_time = 0;
27+
uint64_t max_time = 0;
28+
uint64_t call_count = 0;
29+
};
30+
31+
// Engine for the timing checker. Records the host-side duration of every L0 API
32+
// call (the span between its Prologue and Epilogue) and aggregates per-API
33+
// statistics. A single instance is shared by the generated ze/zes/zet/zer
34+
// timing checkers.
35+
//
36+
// The host clock primitive is a high-resolution monotonic counter
37+
// (QueryPerformanceCounter on Windows, CLOCK_MONOTONIC_RAW elsewhere).
38+
class GlobalTimingState {
39+
public:
40+
static GlobalTimingState &getInstance() {
41+
static GlobalTimingState instance;
42+
return instance;
43+
}
44+
45+
// Configure output modes from environment variables. Called once when the
46+
// checker is enabled.
47+
void configure();
48+
49+
// Prologue hook: stamp the per-thread start time.
50+
void recordStart();
51+
52+
// Epilogue hook: compute the elapsed host time for this call and fold it
53+
// into the per-API statistics. Optionally emits a live log line.
54+
void recordEnd(const char *functionName);
55+
56+
// Teardown reporting.
57+
void printSummary();
58+
void writeCsv();
59+
60+
bool liveLoggingEnabled() const { return liveLogging_; }
61+
62+
// High-resolution monotonic host timestamp in nanoseconds.
63+
static uint64_t getHostTimestampNs();
64+
65+
private:
66+
GlobalTimingState() = default;
67+
68+
std::mutex stats_mutex_;
69+
std::map<std::string, FunctionTime> stats_;
70+
71+
bool liveLogging_ = false;
72+
std::string csvPath_;
73+
74+
// Single-slot per-thread start time. L0 host APIs are not
75+
// re-entrant through the validation layer, so no stack is required; a
76+
// value orphaned by another handler's failing Prologue is simply
77+
// overwritten by the next Prologue.
78+
static thread_local uint64_t apiStartTime_;
79+
};
80+
81+
} // namespace validation_layer

0 commit comments

Comments
 (0)