Skip to content

Commit 1fbd643

Browse files
committed
Checker: System Resource Monitor
- New ZEL_ENABLE_SYSTEM_RESOURCE_TRACKER_CHECKER which enables users to track system resource usage in relation to Level Zero APIs. - Can log to a Level Zero logger file or to a .csv for plotting using the included plot_resource_tracker.py - Currently limited to Linux support. Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
1 parent 5172a58 commit 1fbd643

8 files changed

Lines changed: 2183 additions & 0 deletions

File tree

scripts/plot_resource_tracker.py

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
#
# Copyright (C) 2025 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
# NOTE: the license lives in comments (not a string literal) so that the usage
# text below is the first statement of the module and therefore becomes
# ``__doc__``, which the command-line entry point prints as its help text.
"""
Plot system resource tracking data from Level Zero resource tracker CSV output.

Usage:
    python3 plot_resource_tracker.py <csv_file>

Example:
    export ZEL_ENABLE_SYSTEM_RESOURCE_TRACKER_CHECKER=1
    export ZEL_SYSTEM_RESOURCE_TRACKER_CSV=tracker_output.csv
    export ZEL_ENABLE_LOADER_LOGGING=1
    export ZEL_LOADER_LOGGING_LEVEL=debug
    ./my_level_zero_app
    python3 plot_resource_tracker.py tracker_output.csv
"""
22+
23+
import sys
24+
import pandas as pd
25+
import matplotlib.pyplot as plt
26+
from pathlib import Path
27+
28+
def plot_resource_tracker(csv_file):
    """Plot resource tracking data from CSV file.

    Reads the tracker CSV, draws a 3x3 dashboard of memory usage, Level Zero
    resource counts and per-API memory impact, saves it as
    ``<csv stem>_plot.png`` in the current working directory, prints summary
    statistics to stdout and finally opens the interactive plot window.

    Args:
        csv_file: Path to the CSV file. The columns read here are ``TimeMs``,
            ``APICall``, ``VmRSS_KB``, ``VmSize_KB``, ``VmData_KB``,
            ``Delta_VmRSS_KB``, ``Delta_VmSize_KB``, ``TotalMemory_Bytes``,
            ``Threads``, ``Contexts``, ``CommandQueues``, ``Modules``,
            ``Kernels``, ``CommandLists``, ``EventPools``, ``Events`` and
            ``Fences``.

    Raises:
        ValueError: If the CSV has a header but no data rows.
        KeyError: If one of the columns listed above is missing.
    """
    df = pd.read_csv(csv_file)
    if df.empty:
        # Fail early with a clear message rather than somewhere inside the
        # plotting code, which has nothing to draw for an empty frame.
        raise ValueError(f"No data rows found in '{csv_file}'")

    # Convert time from milliseconds to seconds
    df['TimeSec'] = df['TimeMs'] / 1000.0

    # Per-API aggregates are needed by several plots and by the summary, so
    # compute them once instead of re-grouping the frame each time.
    rss_delta_by_api = df.groupby('APICall')['Delta_VmRSS_KB']
    total_rss_kb = rss_delta_by_api.sum()
    mean_rss_kb = rss_delta_by_api.mean()
    call_counts = df['APICall'].value_counts()

    fig, axes = plt.subplots(3, 3, figsize=(18, 12))
    fig.suptitle(f'Level Zero System Resource Tracking\n{Path(csv_file).name}', fontsize=16)

    # Plot 1: Memory Usage Over Time (VmRSS, VmSize, VmData)
    ax1 = axes[0, 0]
    ax1.plot(df['TimeSec'], df['VmRSS_KB'], label='VmRSS', linewidth=2)
    ax1.plot(df['TimeSec'], df['VmSize_KB'], label='VmSize', linewidth=2, alpha=0.7)
    ax1.plot(df['TimeSec'], df['VmData_KB'], label='VmData', linewidth=2, alpha=0.7)
    _finish_time_axis(ax1, 'Memory (KB)', 'System Memory Usage Over Time')

    # Plot 2: Memory Deltas (showing per-call changes), KB -> MB
    ax2 = axes[0, 1]
    ax2.plot(df['TimeSec'], df['Delta_VmRSS_KB'] / 1024, label='Delta VmRSS', linewidth=1.5)
    ax2.plot(df['TimeSec'], df['Delta_VmSize_KB'] / 1024, label='Delta VmSize', linewidth=1.5, alpha=0.7)
    ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
    _finish_time_axis(ax2, 'Memory Change (MB)', 'Memory Deltas Per API Call')

    # Plot 3: Level Zero Resource Counts
    ax3 = axes[1, 0]
    _plot_nonzero_counts(ax3, df, ('Contexts', 'CommandQueues', 'Modules', 'Kernels'))
    _finish_time_axis(ax3, 'Resource Count',
                      'L0 Resource Counts (Contexts, Queues, Modules, Kernels)')

    # Plot 4: Command Lists and Event Resources
    ax4 = axes[1, 1]
    _plot_nonzero_counts(ax4, df, ('CommandLists', 'EventPools', 'Events', 'Fences'))
    _finish_time_axis(ax4, 'Resource Count', 'L0 Command Lists and Events')

    # Plot 5: Total Memory Allocations, bytes -> MB
    ax5 = axes[2, 0]
    ax5.plot(df['TimeSec'], df['TotalMemory_Bytes'] / (1024*1024), label='Total Memory',
             linewidth=2, color='red')
    _finish_time_axis(ax5, 'Memory (MB)', 'Total L0 Memory Allocations')

    # Plot 6: API Call Distribution (top 10 most frequent).
    # Sorted ascending so the most frequent call ends up at the top of the
    # horizontal bar chart.
    _plot_api_bars(axes[2, 1],
                   call_counts.head(10).sort_values(ascending=True),
                   'steelblue', 'Call Count', 'Top 10 Most Frequent API Calls')

    # Plot 7: Top 10 API Calls by total VmRSS delta (MB)
    _plot_api_bars(axes[2, 2],
                   (total_rss_kb / 1024).sort_values(ascending=True).tail(10),
                   'coral', 'Total Memory Delta (MB)',
                   'Top 10 API Calls by Memory Impact')

    # Plot 8: Top 10 API Calls by average VmRSS delta per call (MB)
    _plot_api_bars(axes[1, 2],
                   (mean_rss_kb / 1024).sort_values(ascending=True).tail(10),
                   'mediumseagreen', 'Avg Memory Delta per Call (MB)',
                   'Top 10 API Calls by Avg Memory per Call')

    # Plot 9: Cumulative memory over time for the top 5 APIs by total impact
    ax9 = axes[0, 2]
    for api in total_rss_kb.nlargest(5).index:
        api_rows = df[df['APICall'] == api]
        ax9.plot(api_rows['TimeSec'], api_rows['Delta_VmRSS_KB'].cumsum() / 1024,
                 label=api, linewidth=2)
    _finish_time_axis(ax9, 'Cumulative Memory Delta (MB)',
                      'Cumulative Memory Impact by Top 5 APIs', legend_fontsize=8)

    plt.tight_layout()

    # Save plot (written to the current working directory)
    output_file = Path(csv_file).stem + '_plot.png'
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    print(f"Plot saved to: {output_file}")

    # Print summary statistics
    print("\n=== Summary Statistics ===")
    print(f"Total API calls tracked: {len(df)}")
    print(f"Time span: {df['TimeSec'].max():.2f} seconds ({df['TimeMs'].max():.2f} ms)")
    print(f"Peak VmRSS: {df['VmRSS_KB'].max():.2f} KB ({df['VmRSS_KB'].max()/1024:.2f} MB)")
    print(f"Peak VmSize: {df['VmSize_KB'].max():.2f} KB ({df['VmSize_KB'].max()/1024:.2f} MB)")
    print(f"Total memory allocated: {df['TotalMemory_Bytes'].max():.2f} bytes "
          f"({df['TotalMemory_Bytes'].max()/(1024*1024):.2f} MB)")
    print(f"Number of threads: {df['Threads'].max()}")
    print("\nPeak resource counts:")
    for column in ('Contexts', 'CommandQueues', 'Modules', 'Kernels', 'CommandLists', 'Events'):
        print(f" {column}: {df[column].max()}")

    # Print top API calls by memory usage
    print("\n=== Top 10 API Calls by Total Memory Impact ===")
    for api, mem in total_rss_kb.sort_values(ascending=False).head(10).items():
        print(f" {api}: {mem:.2f} KB ({mem/1024:.2f} MB)")

    print("\n=== Top 10 API Calls by Average Memory per Call ===")
    for api, mem in mean_rss_kb.sort_values(ascending=False).head(10).items():
        # Look the count up instead of rescanning the whole frame per API.
        count = call_counts.loc[api]
        print(f" {api}: {mem:.2f} KB/call ({count} calls)")

    # Show the plot last: plt.show() can block until the window is closed,
    # which would otherwise hold back the summary printed above.
    plt.show()
    plt.close(fig)


def _finish_time_axis(ax, ylabel, title, legend_fontsize=None):
    """Apply the shared labels, legend and grid of a time-series subplot."""
    ax.set_xlabel('Time (s)')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    # Only build a legend when something labelled was drawn; calling
    # legend() on an axis without labelled artists emits a warning.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        if legend_fontsize is None:
            ax.legend()
        else:
            ax.legend(fontsize=legend_fontsize)
    ax.grid(True, alpha=0.3)


def _plot_nonzero_counts(ax, df, columns):
    """Plot each resource-count column whose peak value is above zero."""
    for column in columns:
        if df[column].max() > 0:
            ax.plot(df['TimeSec'], df[column], label=column, linewidth=2)


def _plot_api_bars(ax, series, color, xlabel, title):
    """Draw a per-API horizontal bar chart with the shared decoration."""
    series.plot(kind='barh', ax=ax, color=color)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='x')
182+
183+
if __name__ == '__main__':
184+
if len(sys.argv) != 2:
185+
print(__doc__)
186+
sys.exit(1)
187+
188+
csv_file = sys.argv[1]
189+
if not Path(csv_file).exists():
190+
print(f"Error: File '{csv_file}' not found")
191+
sys.exit(1)
192+
193+
try:
194+
import pandas
195+
import matplotlib
196+
except ImportError as e:
197+
print(f"Error: Required Python packages not installed")
198+
print(f"Install with: pip install pandas matplotlib")
199+
sys.exit(1)
200+
201+
plot_resource_tracker(csv_file)

source/layers/validation/README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ By default, no validation modes will be enabled. The individual validation modes
2222
- `ZEL_ENABLE_BASIC_LEAK_CHECKER`
2323
- `ZE_ENABLE_THREADING_VALIDATION` (Not yet Implemented)
2424
- `ZEL_ENABLE_CERTIFICATION_CHECKER`
25+
- `ZEL_ENABLE_SYSTEM_RESOURCE_TRACKER_CHECKER`
2526

2627
## Validation Modes
2728

@@ -89,6 +90,40 @@ Validates:
8990
When this mode is enabled, the certification checker validates API usage against the version supported by the driver or an explicitly specified version.
9091
If an API is used that was introduced in a version higher than the supported version, the checker will return `ZE_RESULT_ERROR_UNSUPPORTED_VERSION`.
9192

93+
### `ZEL_ENABLE_SYSTEM_RESOURCE_TRACKER_CHECKER` (Linux Only)
94+
95+
The System Resource Tracker monitors both Level Zero API resources and system resources in real-time. It tracks:
96+
97+
- **L0 Resources**: Contexts, command queues, modules, kernels, event pools, command lists, events, fences, images, samplers, and memory allocations
98+
- **System Metrics**: Virtual memory (VmSize, VmRSS, VmData, VmPeak), thread count, file descriptors
99+
- **Deltas**: Resource changes for each API call
100+
- **Cumulative Totals**: Running summaries of all resource types
101+
102+
The tracker can log to the Level Zero debug log and optionally export data to CSV for graphing and analysis:
103+
104+
```bash
105+
export ZE_ENABLE_VALIDATION_LAYER=1
106+
export ZEL_ENABLE_SYSTEM_RESOURCE_TRACKER_CHECKER=1
107+
export ZEL_SYSTEM_RESOURCE_TRACKER_CSV=tracker_output.csv # Optional: enable CSV export
108+
export ZEL_ENABLE_LOADER_LOGGING=1
109+
export ZEL_LOADER_LOGGING_LEVEL=debug
110+
```
111+
112+
**CSV Output Features:**
113+
- Per-process unique filenames (PID appended automatically)
114+
- 22 columns of metrics including timestamps, system resources, L0 resource counts, and deltas
115+
- Atomic line writes for thread safety
116+
- Companion Python plotting script (`scripts/plot_resource_tracker.py`) for visualization
117+
118+
**Use Cases:**
119+
- Performance analysis and memory leak detection
120+
- Resource lifecycle tracking and optimization
121+
- Debugging and benchmarking
122+
- CI/CD integration for automated resource monitoring
123+
124+
**Platform Support:** This checker is Linux-only and uses `/proc/self/status` for system metrics. It is automatically excluded from Windows and macOS builds.
125+
126+
See [System Resource Tracker documentation](checkers/system_resource_tracker/system_resource_tracker.md) for detailed usage and CSV format.
92127

93128
## Testing
94129

source/layers/validation/checkers/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,8 @@ add_subdirectory(certification)
33
add_subdirectory(events_checker)
44
add_subdirectory(parameter_validation)
55
add_subdirectory(template)
6+
7+
# System resource tracker is Linux-only (uses /proc/self/status)
8+
if(UNIX AND NOT APPLE)
9+
add_subdirectory(system_resource_tracker)
10+
endif()
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# System resource tracker is Linux-only (uses /proc/self/status)
2+
if(UNIX AND NOT APPLE)
3+
target_sources(${TARGET_NAME}
4+
PRIVATE
5+
${CMAKE_CURRENT_LIST_DIR}/zel_system_resource_tracker_checker.h
6+
${CMAKE_CURRENT_LIST_DIR}/zel_system_resource_tracker_checker.cpp
7+
)
8+
endif()
380 KB
Loading

0 commit comments

Comments
 (0)