Skip to content

Commit 85b619b

Browse files
Profiler - Add partition support
NOTE: The GPU ordering used here is not the same as in HSA/HIP. GPUs are ordered via amdsmi, and their GPU_ID fields are then compared to map GPU partitions to each other. Change-Id: If379214f5281d7d5ee98515b3e5ba7affc2e2197 Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
1 parent 2adc8f8 commit 85b619b

9 files changed

Lines changed: 170 additions & 135 deletions

File tree

common/rdc_field.data

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ FLD_DESC_ENT(RDC_FI_REV_ID, "Revision ID of the device",
4545
FLD_DESC_ENT(RDC_FI_TARGET_GRAPHICS_VERSION, "GFX version of the device", "GFX", true)
4646
FLD_DESC_ENT(RDC_FI_NUM_OF_COMPUTE_UNITS, "Number of Compute Units", "COMPUTE_UNITS", true)
4747
FLD_DESC_ENT(RDC_FI_UUID, "Unique ID of the device AKA asic_serial", "UUID", true)
48+
FLD_DESC_ENT(RDC_FI_GPU_PARTITION_COUNT, "GPU partition count", "PARTITION_COUNT", true)
4849

4950
FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true)
5051
FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true)
@@ -136,25 +137,25 @@ FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumlated data write size acr
136137
// This doesn't map to rocprofiler counters directly
137138
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
138139
// See metrics.xml in rocprofiler
139-
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", false)
140-
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
141-
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
142-
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
143-
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", false)
144-
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", false)
140+
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", true)
141+
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", true)
142+
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", true)
143+
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", true)
144+
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", true)
145+
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", true)
145146
// metrics with EVAL are divided by time passed
146-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
147-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
148-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
149-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
150-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
151-
FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", false)
152-
FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", false)
153-
FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", false)
154-
FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", false)
155-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", false)
156-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", false)
157-
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", false)
147+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", true)
148+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", true)
149+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", true)
150+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", true)
151+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", true)
152+
FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", true)
153+
FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", true)
154+
FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", true)
155+
FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", true)
156+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", true)
157+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", true)
158+
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", true)
158159
// CPC
159160
FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_BUSY, "", "CPC_CPC_STAT_BUSY", false)
160161
FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_IDLE, "", "CPC_CPC_STAT_IDLE", false)
@@ -194,7 +195,8 @@ FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_I
194195
FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false)
195196
// Misc
196197
FLD_DESC_ENT(RDC_FI_PROF_SIMD_UTILIZATION, "Fraction of time the SIMDs are being utilized", "SIMD_UTILIZATION", false)
197-
FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", false)
198+
FLD_DESC_ENT(RDC_FI_PROF_UUID, "UUID from rocprofiler", "PROF_UUID", true)
199+
FLD_DESC_ENT(RDC_FI_PROF_KFD_ID, "GPU_ID from rocprofiler, same as KFD_ID", "PROF_KFD_ID", true)
198200

199201
// Events
200202
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)

include/rdc/rdc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ typedef enum {
173173
RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version
174174
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
175175
RDC_FI_UUID, //!< Device UUID
176+
RDC_FI_GPU_PARTITION_COUNT,
176177

177178
/**
178179
* @brief Frequency related fields
@@ -344,6 +345,7 @@ typedef enum {
344345
RDC_FI_PROF_CPF_CPF_TCIU_STALL,
345346
RDC_FI_PROF_SIMD_UTILIZATION,
346347
RDC_FI_PROF_UUID,
348+
RDC_FI_PROF_KFD_ID,
347349

348350
/**
349351
* @brief Raw XGMI counter events

include/rdc_modules/rdc_rocp/RdcRocpBase.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,18 +69,22 @@ class RdcRocpBase {
6969
*/
7070
static const uint32_t collection_duration_us_k = 10000;
7171

72-
double read_feature(rocprofiler_record_counter_t* record, uint32_t gpu_index);
73-
7472
/**
7573
* @brief By default all profiler values are read as doubles
7674
*/
77-
double run_profiler(uint32_t gpu_index, rdc_field_t field);
78-
void map_smi_to_profiler_by_uuid();
75+
double run_profiler(uint32_t agent_index, rdc_field_t field);
76+
77+
/**
78+
* @brief Create a map from entity_id to profiler agent_index.
79+
* This is required due to different structure and ordering.
80+
* Populates entity_to_prof_map.
81+
*/
82+
rdc_status_t map_entity_to_profiler();
7983

8084
std::vector<rocprofiler_agent_v0_t> agents = {};
8185
std::vector<std::shared_ptr<CounterSampler>> samplers = {};
8286
std::map<rdc_field_t, const char*> field_to_metric = {};
83-
std::map<uint32_t, uint32_t> smi_to_profiler_map = {};
87+
std::map<uint32_t, uint32_t> entity_to_prof_map = {};
8488

8589
// these fields must be divided by time passed
8690
std::unordered_set<rdc_field_t> eval_fields = {

rdc_libs/rdc/src/RdcEmbeddedHandler.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_
178178
if (!count) {
179179
return RDC_ST_BAD_PARAMETER;
180180
}
181+
181182
rdc_field_value device_count;
182183
rdc_status_t status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count);
183184
if (status != RDC_ST_OK) {

rdc_libs/rdc/src/RdcMetricFetcherImpl.cc

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ THE SOFTWARE.
2626
#include <sys/time.h>
2727

2828
#include <chrono> //NOLINT
29+
#include <cstddef>
30+
#include <cstdint>
2931
#include <set>
3032
#include <vector>
3133

@@ -86,7 +88,7 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
8688
}
8789

8890
uint64_t RdcMetricFetcherImpl::now() {
89-
struct timeval tv;
91+
struct timeval tv {};
9092
gettimeofday(&tv, NULL);
9193
return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
9294
}
@@ -98,6 +100,7 @@ void RdcMetricFetcherImpl::get_ecc(uint32_t gpu_index, rdc_field_t field_id,
98100

99101
amdsmi_processor_handle processor_handle;
100102
err = get_processor_handle_from_id(gpu_index, &processor_handle);
103+
assert(err == AMDSMI_STATUS_SUCCESS);
101104

102105
// because RDC already had an established order that is different from amd-smi : map blocks to
103106
// fields manually
@@ -521,9 +524,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
521524
case RDC_FI_GPU_CLOCK: {
522525
const uint16_t* clock_array = gpu_metrics.current_gfxclks;
523526
std::vector<uint16_t> valid_clocks;
524-
valid_clocks.reserve(8);
527+
valid_clocks.reserve(AMDSMI_MAX_NUM_GFX_CLKS);
525528

526-
for (uint32_t i = 0; i < 8; i++) {
529+
for (uint32_t i = 0; i < AMDSMI_MAX_NUM_GFX_CLKS; i++) {
527530
uint16_t clk = clock_array[i];
528531
if (clk != 0 && clk != 0xFFFF) {
529532
valid_clocks.push_back(clk);
@@ -540,7 +543,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
540543
}
541544

542545
if (vc == num_partitions) {
543-
value->value.l_int = clock_array[info.instance_index] * 1000000;
546+
value->value.l_int = static_cast<int64_t>(clock_array[info.instance_index]) * 1000000;
544547
value->type = INTEGER;
545548
value->status = RDC_ST_OK;
546549
return RDC_ST_OK;
@@ -620,10 +623,12 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
620623
}
621624

622625
default:
623-
// All other fields => N/A for partition
624-
RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
625-
<< " not supported => NO_DATA.");
626-
return RDC_ST_NO_DATA;
626+
// for now we must let other plugins return valid data for partition metrics
627+
628+
// TODO: All other fields => N/A for partition IN AMDSMI
629+
// RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
630+
// << " not supported => NO_DATA.");
631+
break;
627632
}
628633
} // end if partition
629634

@@ -748,6 +753,17 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
748753
value->value.l_int = static_cast<int64_t>(socket_count);
749754
}
750755
} break;
756+
case RDC_FI_GPU_PARTITION_COUNT: {
757+
uint32_t partition_count = 0;
758+
amdsmi_gpu_metrics_t metrics;
759+
memset(&metrics, 0, sizeof(metrics));
760+
value->status = get_metrics_info(processor_handle, &metrics);
761+
partition_count = metrics.num_partition;
762+
value->type = INTEGER;
763+
if (value->status == AMDSMI_STATUS_SUCCESS) {
764+
value->value.l_int = static_cast<int64_t>(partition_count);
765+
}
766+
} break;
751767
case RDC_FI_POWER_USAGE: {
752768
amdsmi_power_info_t power_info = {};
753769
// Handle API breaking change in amdsmi commit dc4a16da6fb45d581a6e23c78d340172989418a0

rdc_libs/rdc/src/RdcSmiLib.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
186186
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
187187
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED,
188188
RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION,
189-
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID,
189+
RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT,
190190
};
191191
// clang-format on
192192
std::copy(fields.begin(), fields.end(), field_ids);

rdc_libs/rdc/src/RdcWatchTableImpl.cc

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -228,42 +228,6 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
228228
return result;
229229
}
230230

231-
// Check for rocprof fields in partitions
232-
rdc_group_info_t ginfo;
233-
result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
234-
if (result != RDC_ST_OK) {
235-
return result;
236-
}
237-
bool groupHasPartition = false;
238-
for (unsigned int i = 0; i < ginfo.count; i++) {
239-
uint32_t entityId = ginfo.entity_ids[i];
240-
rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId);
241-
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
242-
groupHasPartition = true;
243-
break;
244-
}
245-
}
246-
247-
rdc_field_group_info_t field_info;
248-
result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info);
249-
if (result != RDC_ST_OK) {
250-
return result;
251-
}
252-
bool groupHasRocprof = false;
253-
if (result == RDC_ST_OK) {
254-
for (unsigned int i = 0; i < field_info.count; i++) {
255-
rdc_field_t fid = field_info.field_ids[i];
256-
if (fid >= 800 && fid < 900) { // Rocprof fields in the 800's
257-
groupHasRocprof = true;
258-
break;
259-
}
260-
}
261-
}
262-
263-
if (groupHasPartition && groupHasRocprof) {
264-
return RDC_ST_NOT_SUPPORTED;
265-
}
266-
267231
// See if any of the fields are notification fields, and
268232
// set them up, if so.
269233
result = notifications_->set_listen_events(fields_in_watch);

rdc_libs/rdc/src/SmiUtils.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,10 @@ amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) {
240240
return ret;
241241
}
242242

243+
if (num_partition == nullptr) {
244+
return AMDSMI_STATUS_INVAL;
245+
}
246+
243247
amdsmi_gpu_metrics_t metrics;
244248
memset(&metrics, 0, sizeof(metrics));
245249
ret = get_metrics_info(proc_handle, &metrics);

0 commit comments

Comments
 (0)