3737#include <libdrm/amdgpu.h>
3838#include <libdrm/amdgpu_drm.h>
3939#include <math.h>
40+ #include <pthread.h>
4041#include <stdarg.h>
42+ #include <stdatomic.h>
4143#include <stdbool.h>
4244#include <stdint.h>
4345#include <stdio.h>
46+ #include <stdlib.h>
4447#include <string.h>
4548#include <sys/stat.h>
4649#include <sys/syscall.h>
4952#include <uthash.h>
5053#include <xf86drm.h>
5154
55+ extern bool nvtop_debug_amdgpu_metrics ;
56+ extern bool nvtop_enable_pcie_bw_sleep ;
57+
5258// extern
5359const char * amdgpu_parse_marketing_name (struct amdgpu_gpu_info * info );
5460
@@ -120,16 +126,24 @@ struct gpu_info_amdgpu {
120126
121127 // We poll the fan frequently enough and want to avoid the open/close overhead of the sysfs file
122128 FILE * fanSpeedFILE ; // FILE* for this device current fan speed
123- FILE * PCIeBW ; // FILE* for this device PCIe bandwidth over one second
124129 FILE * powerCap ; // FILE* for this device power cap
125130
131+ // gpu_metrics sysfs file descriptor for non-blocking PCIe bandwidth reading
132+ // (replaces pcie_bw which blocks for 1 second per read due to kernel msleep(1000))
133+ int gpuMetricsFD ;
134+ uint64_t last_pcie_bw_acc ; // Previous pcie_bandwidth_acc value for delta computation
135+ bool has_pcie_bw_acc_prev ; // Whether we have a previous accumulated value
136+
126137 nvtop_device * amdgpuDevice ; // The AMDGPU driver device
127138 nvtop_device * hwmonDevice ; // The AMDGPU driver hwmon device
128139
129140 struct amdgpu_process_info_cache * last_update_process_cache , * current_update_process_cache ; // Cached processes info
130141
131142 // Used to compute the actual fan speed
132143 unsigned maxFanValue ;
144+
145+ // Asynchronous PCIe Bandwidth fetching thread (Fallback if gpuMetricsFD < 0 or missing PCIe)
146+ FILE * PCIeBW ; // FILE* for this device PCIe bandwidth over one second
133147};
134148
135149unsigned amdgpu_count ;
@@ -142,6 +156,7 @@ static bool gpuinfo_amdgpu_get_device_handles(struct list_head *devices, unsigne
142156static void gpuinfo_amdgpu_populate_static_info (struct gpu_info * _gpu_info );
143157static void gpuinfo_amdgpu_refresh_dynamic_info (struct gpu_info * _gpu_info );
144158static void gpuinfo_amdgpu_get_running_processes (struct gpu_info * _gpu_info );
159+ static int rewindAndReadPattern (FILE * file , const char * format , ...);
145160
146161struct gpu_vendor gpu_vendor_amdgpu = {
147162 .init = gpuinfo_amdgpu_init ,
@@ -235,8 +250,11 @@ static bool gpuinfo_amdgpu_init(void) {
235250static void gpuinfo_amdgpu_shutdown (void ) {
236251 for (unsigned i = 0 ; i < amdgpu_count ; ++ i ) {
237252 struct gpu_info_amdgpu * gpu_info = & gpu_infos [i ];
253+
238254 if (gpu_info -> fanSpeedFILE )
239255 fclose (gpu_info -> fanSpeedFILE );
256+ if (gpu_info -> gpuMetricsFD >= 0 )
257+ close (gpu_info -> gpuMetricsFD );
240258 if (gpu_info -> PCIeBW )
241259 fclose (gpu_info -> PCIeBW );
242260 if (gpu_info -> powerCap )
@@ -370,11 +388,29 @@ static void initDeviceSysfsPaths(struct gpu_info_amdgpu *gpu_info) {
370388 }
371389
372390 int sysfsFD = open (devicePath , O_RDONLY );
373- // Open the PCIe bandwidth file for dynamic info gathering
391+ // Open the gpu_metrics file for non-blocking PCIe bandwidth reading
392+ // (pcie_bw sysfs blocks for 1 second per read due to kernel msleep(1000))
393+ gpu_info -> gpuMetricsFD = openat (sysfsFD , "gpu_metrics" , O_RDONLY );
394+ gpu_info -> last_pcie_bw_acc = 0 ;
395+ gpu_info -> has_pcie_bw_acc_prev = false;
396+
397+ bool metrics_has_pcie = false;
398+ if (gpu_info -> gpuMetricsFD >= 0 ) {
399+ uint8_t header [4 ];
400+ if (pread (gpu_info -> gpuMetricsFD , header , sizeof (header ), 0 ) == 4 ) {
401+ if (header [2 ] == 1 && header [3 ] >= 4 ) {
402+ metrics_has_pcie = true;
403+ }
404+ }
405+ }
406+
407+ // Open the legacy PCIe bandwidth file for async worker fallback gathering
374408 gpu_info -> PCIeBW = NULL ;
375- int pcieBWFD = openat (sysfsFD , "pcie_bw" , O_RDONLY );
376- if (pcieBWFD ) {
377- gpu_info -> PCIeBW = fdopen (pcieBWFD , "r" );
409+ if (!metrics_has_pcie ) {
410+ int pcieBWFD = openat (sysfsFD , "pcie_bw" , O_RDONLY );
411+ if (pcieBWFD >= 0 ) {
412+ gpu_info -> PCIeBW = fdopen (pcieBWFD , "r" );
413+ }
378414 }
379415
380416 close (sysfsFD );
@@ -466,6 +502,7 @@ static bool gpuinfo_amdgpu_get_device_handles(struct list_head *devices, unsigne
466502 list_add_tail (& gpu_infos [amdgpu_count ].base .list , devices );
467503 // Register a fdinfo callback for this GPU
468504 processinfo_register_fdinfo_callback (parse_drm_fdinfo_amd , & gpu_infos [amdgpu_count ].base );
505+
469506 amdgpu_count ++ ;
470507 } else {
471508 _drmFreeVersion (ver );
@@ -705,11 +742,21 @@ static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
705742
706743 // Memory usage
707744 struct drm_amdgpu_memory_info memory_info ;
745+ struct timespec t_query_start , t_query_end ;
746+ if (nvtop_debug_amdgpu_metrics ) {
747+ clock_gettime (CLOCK_MONOTONIC , & t_query_start );
748+ }
708749 if (libdrm_amdgpu_handle && _amdgpu_query_info )
709750 last_libdrm_return_status =
710751 _amdgpu_query_info (gpu_info -> amdgpu_device , AMDGPU_INFO_MEMORY , sizeof (memory_info ), & memory_info );
711752 else
712753 last_libdrm_return_status = 1 ;
754+ if (nvtop_debug_amdgpu_metrics ) {
755+ clock_gettime (CLOCK_MONOTONIC , & t_query_end );
756+ double elapsed_q = (t_query_end .tv_sec - t_query_start .tv_sec ) * 1000.0 +
757+ (t_query_end .tv_nsec - t_query_start .tv_nsec ) / 1000000.0 ;
758+ fprintf (stderr , "[DEBUG] AMD _amdgpu_query_info(AMDGPU_INFO_MEMORY) took %.2f ms\n" , elapsed_q );
759+ }
713760 if (!last_libdrm_return_status ) {
714761 if (gpu_info -> base .static_info .integrated_graphics ) {
715762 SET_GPUINFO_DYNAMIC (dynamic_info , total_memory ,
@@ -739,7 +786,16 @@ static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
739786
740787 // Fan speed
741788 unsigned currentFanSpeed ;
789+ if (nvtop_debug_amdgpu_metrics ) {
790+ clock_gettime (CLOCK_MONOTONIC , & t_query_start );
791+ }
742792 int patternsMatched = rewindAndReadPattern (gpu_info -> fanSpeedFILE , "%u" , & currentFanSpeed );
793+ if (nvtop_debug_amdgpu_metrics ) {
794+ clock_gettime (CLOCK_MONOTONIC , & t_query_end );
795+ double elapsed_q = (t_query_end .tv_sec - t_query_start .tv_sec ) * 1000.0 +
796+ (t_query_end .tv_nsec - t_query_start .tv_nsec ) / 1000000.0 ;
797+ fprintf (stderr , "[DEBUG] AMD rewindAndReadPattern(fanSpeedFILE) took %.2f ms\n" , elapsed_q );
798+ }
743799 if (patternsMatched == 1 ) {
744800 SET_GPUINFO_DYNAMIC (dynamic_info , fan_speed , currentFanSpeed * 100 / gpu_info -> maxFanValue );
745801 }
@@ -762,21 +818,93 @@ static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
762818 SET_GPUINFO_DYNAMIC (dynamic_info , pcie_link_gen , pcieGen );
763819 }
764820
765- // PCIe bandwidth
766- if (gpu_info -> PCIeBW ) {
767- // According to https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/amd/pm/amdgpu_pm.c, under the pcie_bw
768- // section, we should be able to read the number of packets received and sent by the GPU and get the maximum payload
769- // size during the last second. This is untested but should work when the file is populated by the driver.
821+ // PCIe bandwidth via gpu_metrics (non-blocking, replaces pcie_bw which has a 1-second kernel sleep)
822+ if (gpu_info -> gpuMetricsFD >= 0 ) {
823+ // Read the gpu_metrics binary file from sysfs
824+ // The file starts with a 4-byte header: structure_size(u16), format_revision(u8), content_revision(u8)
825+ // For dGPU metrics v1_4+, pcie_bandwidth_inst is available at a known offset
826+ uint8_t metrics_buf [256 ]; // Large enough for the header + PCIe bandwidth fields
827+ ssize_t nread = pread (gpu_info -> gpuMetricsFD , metrics_buf , sizeof (metrics_buf ), 0 );
828+ if (nread >= 4 ) {
829+ uint16_t structure_size ;
830+ memcpy (& structure_size , metrics_buf , sizeof (structure_size ));
831+ uint8_t format_revision = metrics_buf [2 ];
832+ uint8_t content_revision = metrics_buf [3 ];
833+
834+ // gpu_metrics v1_4+ (dGPU) has pcie_bandwidth_acc and pcie_bandwidth_inst
835+ // format_revision == 1 means dGPU metrics, content_revision >= 4 means v1_4+
836+ if (format_revision == 1 && content_revision >= 4 && nread >= (ssize_t )structure_size ) {
837+ // In gpu_metrics_v1_4, the layout after the header has pcie_bandwidth_acc and pcie_bandwidth_inst
838+ // as uint64_t fields. We use pcie_bandwidth_inst (instantaneous bandwidth in GB/sec)
839+ // and split evenly as an approximation for RX/TX since the kernel doesn't separate them.
840+ //
841+ // Field offsets within gpu_metrics_v1_4 (after the 4-byte header):
842+ // The pcie_bandwidth_inst field follows pcie_bandwidth_acc.
843+ // We scan from the structure definition to find pcie_bandwidth_acc offset.
844+ //
845+ // Offset calculation for gpu_metrics_v1_4:
846+ // header(4) + temp_hotspot(2) + temp_mem(2) + temp_vrsoc(2) = 10
847+ // curr_socket_power(2) = 12
848+ // avg_gfx_activity(2) + avg_umc_activity(2) + vcn_activity[4](8) = 24
849+ // energy_accumulator(8) = 32
850+ // system_clock_counter(8) = 40
851+ // throttle_status(4) = 44
852+ // gfxclk_lock_status(4) = 48
853+ // pcie_link_width(2) + pcie_link_speed(2) = 52
854+ // xgmi_link_width(2) + xgmi_link_speed(2) = 56
855+ // gfx_activity_acc(4) + mem_activity_acc(4) = 64
856+ // pcie_bandwidth_acc(8) = offset 64, ends at 72
857+ // pcie_bandwidth_inst(8) = offset 72, ends at 80
858+ // const size_t pcie_bw_acc_offset = 64;
859+ const size_t pcie_bw_inst_offset = 72 ;
860+ if (nread >= (ssize_t )(pcie_bw_inst_offset + sizeof (uint64_t ))) {
861+ uint64_t pcie_bw_inst ;
862+ memcpy (& pcie_bw_inst , metrics_buf + pcie_bw_inst_offset , sizeof (pcie_bw_inst ));
863+
864+ // In gpu_metrics, if a sensor is unsupported, it often reports 0xFFFFFFFFFFFFFFFF (UINT64_MAX)
865+ if (pcie_bw_inst != UINT64_MAX ) {
866+ // pcie_bandwidth_inst is in GB/sec, convert to KiB/sec
867+ // Split evenly between RX and TX as a best approximation
868+ uint64_t total_kib = pcie_bw_inst * 1024 * 1024 ; // GB/sec -> KiB/sec
869+ SET_GPUINFO_DYNAMIC (dynamic_info , pcie_rx , total_kib / 2 );
870+ SET_GPUINFO_DYNAMIC (dynamic_info , pcie_tx , total_kib / 2 );
871+ }
872+ }
873+ }
874+
875+ if (nvtop_debug_amdgpu_metrics ) {
876+ fprintf (stderr , "[DEBUG] AMD gpu_metrics read %zd bytes: format_revision=%u, content_revision=%u\n" , nread ,
877+ format_revision , content_revision );
878+ fprintf (stderr , "[DEBUG] Raw gpu_metrics hex dump:\n" );
879+ for (ssize_t i = 0 ; i < nread ; i ++ ) {
880+ fprintf (stderr , "%02x " , metrics_buf [i ]);
881+ if ((i + 1 ) % 16 == 0 )
882+ fprintf (stderr , "\n" );
883+ }
884+ fprintf (stderr , "\n" );
885+ }
886+ }
887+ } else if (gpu_info -> PCIeBW && nvtop_enable_pcie_bw_sleep ) {
770888 uint64_t received , transmitted ;
771889 int maxPayloadSize ;
890+ if (nvtop_debug_amdgpu_metrics ) {
891+ clock_gettime (CLOCK_MONOTONIC , & t_query_start );
892+ }
772893 int NreadPatterns =
773894 rewindAndReadPattern (gpu_info -> PCIeBW , "%" SCNu64 " %" SCNu64 " %i" , & received , & transmitted , & maxPayloadSize );
895+ if (nvtop_debug_amdgpu_metrics ) {
896+ clock_gettime (CLOCK_MONOTONIC , & t_query_end );
897+ double elapsed_q = (t_query_end .tv_sec - t_query_start .tv_sec ) * 1000.0 +
898+ (t_query_end .tv_nsec - t_query_start .tv_nsec ) / 1000000.0 ;
899+ fprintf (stderr , "[DEBUG] AMD pcie_bw inline read took %.2f ms. Matches: %d\n" , elapsed_q , NreadPatterns );
900+ }
774901 if (NreadPatterns == 3 ) {
775902 received *= maxPayloadSize ;
776903 transmitted *= maxPayloadSize ;
777- // Set in KiB
904+ // Store in KiB
778905 received /= 1024 ;
779906 transmitted /= 1024 ;
907+
780908 SET_GPUINFO_DYNAMIC (dynamic_info , pcie_rx , received );
781909 SET_GPUINFO_DYNAMIC (dynamic_info , pcie_tx , transmitted );
782910 }
0 commit comments