diff --git a/.gitignore b/.gitignore
index df7dff2a..0e8d46f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ build/
 cmake-build*/
 .vscode
 .idea
+.cache
+
diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index 9e4d1c9d..f3bc5458 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -61,6 +61,7 @@ enum gpuinfo_static_info_valid {
   gpuinfo_l2cache_size_valid,
   gpuinfo_n_exec_engines_valid,
   gpuinfo_engine_count_valid,
+  gpuinfo_fan_rpm_max_valid,
   gpuinfo_static_info_count,
 };
 
@@ -76,6 +77,7 @@ struct gpuinfo_static_info {
   unsigned l2cache_size;
   unsigned n_exec_engines;
   unsigned engine_count;
+  unsigned fan_rpm_max;
   bool integrated_graphics;
   bool encode_decode_shared;
   unsigned char valid[(gpuinfo_static_info_count + CHAR_BIT - 1) / CHAR_BIT];
@@ -132,7 +134,7 @@ struct gpuinfo_dynamic_info {
   unsigned int gpu_temp;            // GPU temperature °celsius
   unsigned int power_draw;          // Power usage in milliwatts
   unsigned int power_draw_max;      // Max power usage in milliwatts
-  bool multi_instance_mode;          // True if the GPU is in multi-instance mode
+  bool multi_instance_mode;         // True if the GPU is in multi-instance mode
   unsigned char valid[(gpuinfo_dynamic_info_count + CHAR_BIT - 1) / CHAR_BIT];
 };
 
diff --git a/src/extract_gpuinfo.c b/src/extract_gpuinfo.c
index 896b78fb..a8752847 100644
--- a/src/extract_gpuinfo.c
+++ b/src/extract_gpuinfo.c
@@ -134,45 +134,69 @@ bool gpuinfo_fix_dynamic_info_from_process_info(struct list_head *devices) {
     unsigned reportedGpuRate = dynamic_info->gpu_util_rate;
     RESET_GPUINFO_DYNAMIC(dynamic_info, gpu_util_rate);
 
-    // AMDGPU does not provide encode and decode utilization through the DRM sensor info.
-    // Update them here since per-process sysfs exposes this information.
-    bool needGpuEncode = !GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, encoder_rate);
-    bool needGpuDecode = !GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, decoder_rate);
+    // For encode/decode, save vendor-reported values and reset, then re-aggregate
+    // from per-process info (same pattern as gpu_util_rate). This ensures GPUs that
+    // set a baseline (e.g., AMDGPU sets 0) show 0% when idle, while GPUs that never
+    // set encoder_rate/decoder_rate still show null.
+    bool validReportedEncoderRate = GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, encoder_rate);
+    unsigned reportedEncoderRate = dynamic_info->encoder_rate;
+    if (validReportedEncoderRate)
+      RESET_GPUINFO_DYNAMIC(dynamic_info, encoder_rate);
+
+    bool validReportedDecoderRate = GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, decoder_rate);
+    unsigned reportedDecoderRate = dynamic_info->decoder_rate;
+    if (validReportedDecoderRate)
+      RESET_GPUINFO_DYNAMIC(dynamic_info, decoder_rate);
+
     bool needGPUMemory = !GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, used_memory) &&
                          GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, total_memory);
-    if (needGpuRate || needGpuEncode || needGpuDecode || needGPUMemory) {
-      for (unsigned processIdx = 0; processIdx < device->processes_count; ++processIdx) {
-        struct gpu_process *process_info = &device->processes[processIdx];
-        if (needGpuRate && GPUINFO_PROCESS_FIELD_VALID(process_info, gpu_usage)) {
-          if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, gpu_util_rate)) {
-            dynamic_info->gpu_util_rate = MYMIN(100, dynamic_info->gpu_util_rate + process_info->gpu_usage);
-          } else {
-            SET_GPUINFO_DYNAMIC(dynamic_info, gpu_util_rate, MYMIN(100, process_info->gpu_usage));
-          }
+    for (unsigned processIdx = 0; processIdx < device->processes_count; ++processIdx) {
+      struct gpu_process *process_info = &device->processes[processIdx];
+      if (needGpuRate && GPUINFO_PROCESS_FIELD_VALID(process_info, gpu_usage)) {
+        if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, gpu_util_rate)) {
+          dynamic_info->gpu_util_rate = MYMIN(100, dynamic_info->gpu_util_rate + process_info->gpu_usage);
+        } else {
+          SET_GPUINFO_DYNAMIC(dynamic_info, gpu_util_rate, MYMIN(100, process_info->gpu_usage));
         }
-        if (needGpuEncode && GPUINFO_PROCESS_FIELD_VALID(process_info, encode_usage)) {
-          if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, encoder_rate)) {
-            dynamic_info->encoder_rate = MYMIN(100, dynamic_info->encoder_rate + process_info->encode_usage);
-          } else {
-            SET_GPUINFO_DYNAMIC(dynamic_info, encoder_rate, MYMIN(100, process_info->encode_usage));
-          }
+      }
+      if (GPUINFO_PROCESS_FIELD_VALID(process_info, encode_usage)) {
+        if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, encoder_rate)) {
+          dynamic_info->encoder_rate = MYMIN(100, dynamic_info->encoder_rate + process_info->encode_usage);
+        } else {
+          SET_GPUINFO_DYNAMIC(dynamic_info, encoder_rate, MYMIN(100, process_info->encode_usage));
         }
-        if (needGpuDecode && GPUINFO_PROCESS_FIELD_VALID(process_info, decode_usage)) {
-          if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, decoder_rate)) {
-            dynamic_info->decoder_rate = MYMIN(100, dynamic_info->decoder_rate + process_info->decode_usage);
-          } else {
-            SET_GPUINFO_DYNAMIC(dynamic_info, decoder_rate, MYMIN(100, process_info->decode_usage));
-          }
+      }
+      if (GPUINFO_PROCESS_FIELD_VALID(process_info, decode_usage)) {
+        if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, decoder_rate)) {
+          dynamic_info->decoder_rate = MYMIN(100, dynamic_info->decoder_rate + process_info->decode_usage);
+        } else {
+          SET_GPUINFO_DYNAMIC(dynamic_info, decoder_rate, MYMIN(100, process_info->decode_usage));
         }
-        if (needGPUMemory && GPUINFO_PROCESS_FIELD_VALID(process_info, gpu_memory_usage)) {
-          if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, used_memory)) {
-            dynamic_info->used_memory += dynamic_info->used_memory + process_info->gpu_memory_usage;
-          } else {
-            SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, process_info->gpu_memory_usage);
-          }
+      }
+      if (needGPUMemory && GPUINFO_PROCESS_FIELD_VALID(process_info, gpu_memory_usage)) {
+        if (GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, used_memory)) {
+          dynamic_info->used_memory += process_info->gpu_memory_usage; // add only this process's usage
+        } else {
+          SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, process_info->gpu_memory_usage);
         }
       }
     }
+    // Restore vendor-reported encoder/decoder rate if process aggregation didn't produce a value,
+    // or use the max of vendor-reported and process-aggregated
+    if (validReportedEncoderRate) {
+      if (!GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, encoder_rate)) {
+        SET_GPUINFO_DYNAMIC(dynamic_info, encoder_rate, reportedEncoderRate);
+      } else if (dynamic_info->encoder_rate < reportedEncoderRate) {
+        dynamic_info->encoder_rate = reportedEncoderRate;
+      }
+    }
+    if (validReportedDecoderRate) {
+      if (!GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, decoder_rate)) {
+        SET_GPUINFO_DYNAMIC(dynamic_info, decoder_rate, reportedDecoderRate);
+      } else if (dynamic_info->decoder_rate < reportedDecoderRate) {
+        dynamic_info->decoder_rate = reportedDecoderRate;
+      }
+    }
     // Sanitize what we got from processes: we can't have more than the total!
     if (needGPUMemory && GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, used_memory) &&
         GPUINFO_DYNAMIC_FIELD_VALID(dynamic_info, total_memory) &&
@@ -394,13 +418,14 @@ void gpuinfo_refresh_utilisation_rate(struct gpu_info *gpu_info) {
     return;
 
   if (IS_VALID(gpuinfo_engine_count_valid, gpu_info->static_info.valid))
-          ec = gpu_info->static_info.engine_count;
+    ec = gpu_info->static_info.engine_count;
   else
-          ec = 1;
+    ec = 1;
 
   avg_delta_secs = ((double)total_delta / gpu_info->processes_count) / 1000000000.0;
   max_freq_hz = gpu_info->dynamic_info.gpu_clock_speed_max * 1000000;
-  utilisation_rate = (unsigned int)((((double)gfx_total_process_cycles) / (((double)max_freq_hz) * avg_delta_secs * ec)) * 100);
+  utilisation_rate =
+      (unsigned int)((((double)gfx_total_process_cycles) / (((double)max_freq_hz) * avg_delta_secs * ec)) * 100);
   utilisation_rate = utilisation_rate > 100 ? 100 : utilisation_rate;
 
   SET_GPUINFO_DYNAMIC(&gpu_info->dynamic_info, gpu_util_rate, utilisation_rate);
diff --git a/src/extract_gpuinfo_amdgpu.c b/src/extract_gpuinfo_amdgpu.c
index f64c353c..cb4331f0 100644
--- a/src/extract_gpuinfo_amdgpu.c
+++ b/src/extract_gpuinfo_amdgpu.c
@@ -37,10 +37,13 @@
 #include <libdrm/amdgpu.h>
 #include <libdrm/amdgpu_drm.h>
 #include <math.h>
+#include <pthread.h>
 #include <stdarg.h>
+#include <stdatomic.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
@@ -49,6 +52,9 @@
 #include <uthash.h>
 #include <xf86drm.h>
 
+extern bool nvtop_debug_amdgpu_metrics;
+extern bool nvtop_enable_pcie_bw_sleep;
+
 // extern
 const char *amdgpu_parse_marketing_name(struct amdgpu_gpu_info *info);
 
@@ -120,9 +126,15 @@ struct gpu_info_amdgpu {
 
   // We poll the fan frequently enough and want to avoid the open/close overhead of the sysfs file
   FILE *fanSpeedFILE; // FILE* for this device current fan speed
-  FILE *PCIeBW;       // FILE* for this device PCIe bandwidth over one second
+  FILE *fanRPMFILE;   // FILE* for raw RPM reading (always fan1_input)
   FILE *powerCap;     // FILE* for this device power cap
 
+  // gpu_metrics sysfs file descriptor for non-blocking PCIe bandwidth reading
+  // (replaces pcie_bw which blocks for 1 second per read due to kernel msleep(1000))
+  int gpuMetricsFD;
+  uint64_t last_pcie_bw_acc; // Previous pcie_bandwidth_acc value for delta computation
+  bool has_pcie_bw_acc_prev; // Whether we have a previous accumulated value
+
   nvtop_device *amdgpuDevice; // The AMDGPU driver device
   nvtop_device *hwmonDevice;  // The AMDGPU driver hwmon device
 
@@ -130,6 +142,9 @@ struct gpu_info_amdgpu {
 
   // Used to compute the actual fan speed
   unsigned maxFanValue;
+
+  // Legacy pcie_bw sysfs stream (a read blocks for about one second); only opened when gpu_metrics lacks PCIe data
+  FILE *PCIeBW; // FILE* for this device PCIe bandwidth over one second
 };
 
 unsigned amdgpu_count;
@@ -142,6 +157,7 @@ static bool gpuinfo_amdgpu_get_device_handles(struct list_head *devices, unsigne
 static void gpuinfo_amdgpu_populate_static_info(struct gpu_info *_gpu_info);
 static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info);
 static void gpuinfo_amdgpu_get_running_processes(struct gpu_info *_gpu_info);
+static int rewindAndReadPattern(FILE *file, const char *format, ...);
 
 struct gpu_vendor gpu_vendor_amdgpu = {
     .init = gpuinfo_amdgpu_init,
@@ -235,8 +251,14 @@ static bool gpuinfo_amdgpu_init(void) {
 static void gpuinfo_amdgpu_shutdown(void) {
   for (unsigned i = 0; i < amdgpu_count; ++i) {
     struct gpu_info_amdgpu *gpu_info = &gpu_infos[i];
+
     if (gpu_info->fanSpeedFILE)
       fclose(gpu_info->fanSpeedFILE);
+    // Only close if it's a separate file handle (not shared with fanSpeedFILE)
+    if (gpu_info->fanRPMFILE && gpu_info->fanRPMFILE != gpu_info->fanSpeedFILE)
+      fclose(gpu_info->fanRPMFILE);
+    if (gpu_info->gpuMetricsFD >= 0)
+      close(gpu_info->gpuMetricsFD);
     if (gpu_info->PCIeBW)
       fclose(gpu_info->PCIeBW);
     if (gpu_info->powerCap)
@@ -332,15 +354,32 @@ static void initDeviceSysfsPaths(struct gpu_info_amdgpu *gpu_info) {
 
     // Look for which fan to use (PWM or RPM)
     gpu_info->fanSpeedFILE = NULL;
+    gpu_info->fanRPMFILE = NULL;
     unsigned pwmIsEnabled;
     int NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "pwm1_enable", "%u", &pwmIsEnabled);
     bool usePWMSensor = NreadPatterns == 1 && pwmIsEnabled > 0;
 
     bool useRPMSensor = false;
-    if (!usePWMSensor) {
+    // When pwm1_enable=2 (automatic), the driver/firmware controls the fan
+    // directly and pwm1 often reads 0 regardless of actual speed.
+    // Prefer fan1_input (RPM tachometer) which always reflects reality.
+    if (usePWMSensor && pwmIsEnabled == 2) {
+      // Check if RPM tachometer is available and readable
+      unsigned rpmCheck;
+      NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "fan1_input", "%u", &rpmCheck);
+      if (NreadPatterns == 1) {
+        unsigned rpmMax;
+        NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "fan1_max", "%u", &rpmMax);
+        if (NreadPatterns == 1 && rpmMax > 0) {
+          usePWMSensor = false;
+          useRPMSensor = true;
+        }
+      }
+    }
+    if (!usePWMSensor && !useRPMSensor) {
       unsigned rpmIsEnabled;
       NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "fan1_enable", "%u", &rpmIsEnabled);
-      useRPMSensor = NreadPatterns && rpmIsEnabled > 0;
+      useRPMSensor = NreadPatterns == 1 && rpmIsEnabled > 0;
     }
     // Either RPM or PWM or neither
     assert((useRPMSensor ^ usePWMSensor) || (!useRPMSensor && !usePWMSensor));
@@ -360,6 +399,21 @@ static void initDeviceSysfsPaths(struct gpu_info_amdgpu *gpu_info) {
         }
       }
     }
+
+    // Always try to open fan1_input for raw RPM display, independent of
+    // which sensor is used for percentage calculation.
+    // If fanSpeedFILE already reads fan1_input (useRPMSensor), reuse it.
+    if (useRPMSensor && gpu_info->fanSpeedFILE) {
+      gpu_info->fanRPMFILE = gpu_info->fanSpeedFILE;
+    } else {
+      int fanRPMFD = openat(hwmonFD, "fan1_input", O_RDONLY);
+      if (fanRPMFD >= 0) {
+        gpu_info->fanRPMFILE = fdopen(fanRPMFD, "r");
+        if (!gpu_info->fanRPMFILE)
+          close(fanRPMFD);
+      }
+    }
+
     // Open the power cap file for dynamic info gathering
     gpu_info->powerCap = NULL;
     int powerCapFD = openat(hwmonFD, "power1_cap", O_RDONLY);
@@ -370,11 +424,29 @@ static void initDeviceSysfsPaths(struct gpu_info_amdgpu *gpu_info) {
   }
 
   int sysfsFD = open(devicePath, O_RDONLY);
-  // Open the PCIe bandwidth file for dynamic info gathering
+  // Open the gpu_metrics file for non-blocking PCIe bandwidth reading
+  // (pcie_bw sysfs blocks for 1 second per read due to kernel msleep(1000))
+  gpu_info->gpuMetricsFD = openat(sysfsFD, "gpu_metrics", O_RDONLY);
+  gpu_info->last_pcie_bw_acc = 0;
+  gpu_info->has_pcie_bw_acc_prev = false;
+
+  bool metrics_has_pcie = false;
+  if (gpu_info->gpuMetricsFD >= 0) {
+    uint8_t header[4];
+    if (pread(gpu_info->gpuMetricsFD, header, sizeof(header), 0) == 4) {
+      if (header[2] == 1 && header[3] >= 4) {
+        metrics_has_pcie = true;
+      }
+    }
+  }
+
+  // Open the legacy PCIe bandwidth file as a fallback when gpu_metrics exposes no PCIe bandwidth fields
   gpu_info->PCIeBW = NULL;
-  int pcieBWFD = openat(sysfsFD, "pcie_bw", O_RDONLY);
-  if (pcieBWFD) {
-    gpu_info->PCIeBW = fdopen(pcieBWFD, "r");
+  if (!metrics_has_pcie) {
+    int pcieBWFD = openat(sysfsFD, "pcie_bw", O_RDONLY);
+    if (pcieBWFD >= 0) {
+      gpu_info->PCIeBW = fdopen(pcieBWFD, "r");
+    }
   }
 
   close(sysfsFD);
@@ -466,6 +538,7 @@ static bool gpuinfo_amdgpu_get_device_handles(struct list_head *devices, unsigne
       list_add_tail(&gpu_infos[amdgpu_count].base.list, devices);
       // Register a fdinfo callback for this GPU
       processinfo_register_fdinfo_callback(parse_drm_fdinfo_amd, &gpu_infos[amdgpu_count].base);
+
       amdgpu_count++;
     } else {
       _drmFreeVersion(ver);
@@ -627,6 +700,13 @@ static void gpuinfo_amdgpu_populate_static_info(struct gpu_info *_gpu_info) {
     if (nReadPatterns == 1) {
       SET_GPUINFO_STATIC(static_info, temperature_shutdown_threshold, emergencyTemp);
     }
+
+    // Fan RPM max (for UI display alongside percentage)
+    unsigned fanRPMMax;
+    nReadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "fan1_max", "%u", &fanRPMMax);
+    if (nReadPatterns == 1 && fanRPMMax > 0) {
+      SET_GPUINFO_STATIC(static_info, fan_rpm_max, fanRPMMax);
+    }
   }
 
   nvtop_pcie_link max_link_characteristics;
@@ -705,16 +785,29 @@ static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
 
   // Memory usage
   struct drm_amdgpu_memory_info memory_info;
+  struct timespec t_query_start, t_query_end;
+  if (nvtop_debug_amdgpu_metrics) {
+    clock_gettime(CLOCK_MONOTONIC, &t_query_start);
+  }
   if (libdrm_amdgpu_handle && _amdgpu_query_info)
     last_libdrm_return_status =
         _amdgpu_query_info(gpu_info->amdgpu_device, AMDGPU_INFO_MEMORY, sizeof(memory_info), &memory_info);
   else
     last_libdrm_return_status = 1;
+  if (nvtop_debug_amdgpu_metrics) {
+    clock_gettime(CLOCK_MONOTONIC, &t_query_end);
+    double elapsed_q = (t_query_end.tv_sec - t_query_start.tv_sec) * 1000.0 +
+                       (t_query_end.tv_nsec - t_query_start.tv_nsec) / 1000000.0;
+    fprintf(stderr, "[DEBUG] AMD _amdgpu_query_info(AMDGPU_INFO_MEMORY) took %.2f ms\n", elapsed_q);
+  }
   if (!last_libdrm_return_status) {
     if (gpu_info->base.static_info.integrated_graphics) {
-      SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.vram.total_heap_size + memory_info.gtt.total_heap_size);
+      SET_GPUINFO_DYNAMIC(dynamic_info, total_memory,
+                          memory_info.vram.total_heap_size + memory_info.gtt.total_heap_size);
       SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.vram.heap_usage + memory_info.gtt.heap_usage);
-      SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.vram.total_heap_size + memory_info.gtt.total_heap_size - dynamic_info->used_memory);
+      SET_GPUINFO_DYNAMIC(dynamic_info, free_memory,
+                          memory_info.vram.total_heap_size + memory_info.gtt.total_heap_size -
+                              dynamic_info->used_memory);
     } else {
       SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.vram.total_heap_size);
       SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.vram.heap_usage);
@@ -736,11 +829,37 @@ static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
 
   // Fan speed
   unsigned currentFanSpeed;
+  if (nvtop_debug_amdgpu_metrics) {
+    clock_gettime(CLOCK_MONOTONIC, &t_query_start);
+  }
   int patternsMatched = rewindAndReadPattern(gpu_info->fanSpeedFILE, "%u", &currentFanSpeed);
+  if (nvtop_debug_amdgpu_metrics) {
+    clock_gettime(CLOCK_MONOTONIC, &t_query_end);
+    double elapsed_q = (t_query_end.tv_sec - t_query_start.tv_sec) * 1000.0 +
+                       (t_query_end.tv_nsec - t_query_start.tv_nsec) / 1000000.0;
+    fprintf(stderr, "[DEBUG] AMD rewindAndReadPattern(fanSpeedFILE) took %.2f ms\n", elapsed_q);
+  }
   if (patternsMatched == 1) {
     SET_GPUINFO_DYNAMIC(dynamic_info, fan_speed, currentFanSpeed * 100 / gpu_info->maxFanValue);
   }
 
+  // Fan RPM (raw tachometer reading)
+  if (gpu_info->fanRPMFILE) {
+    unsigned currentRPM;
+    // If fanRPMFILE is the same as fanSpeedFILE (RPM sensor used for both),
+    // reuse the value we already read instead of reading the file again.
+    if (gpu_info->fanRPMFILE == gpu_info->fanSpeedFILE) {
+      if (patternsMatched == 1) {
+        SET_GPUINFO_DYNAMIC(dynamic_info, fan_rpm, currentFanSpeed);
+      }
+    } else {
+      patternsMatched = rewindAndReadPattern(gpu_info->fanRPMFILE, "%u", &currentRPM);
+      if (patternsMatched == 1) {
+        SET_GPUINFO_DYNAMIC(dynamic_info, fan_rpm, currentRPM);
+      }
+    }
+  }
+
   // Device power usage
   if (libdrm_amdgpu_handle && _amdgpu_query_sensor_info)
     last_libdrm_return_status =
@@ -759,21 +878,93 @@ static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
     SET_GPUINFO_DYNAMIC(dynamic_info, pcie_link_gen, pcieGen);
   }
 
-  // PCIe bandwidth
-  if (gpu_info->PCIeBW) {
-    // According to https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/amd/pm/amdgpu_pm.c, under the pcie_bw
-    // section, we should be able to read the number of packets received and sent by the GPU and get the maximum payload
-    // size during the last second. This is untested but should work when the file is populated by the driver.
+  // PCIe bandwidth via gpu_metrics (non-blocking); skipped when the legacy pcie_bw fallback file was opened instead
+  if (gpu_info->gpuMetricsFD >= 0 && !gpu_info->PCIeBW) {
+    // Read the gpu_metrics binary file from sysfs
+    // The file starts with a 4-byte header: structure_size(u16), format_revision(u8), content_revision(u8)
+    // For dGPU metrics v1_4+, pcie_bandwidth_inst is available at a known offset
+    uint8_t metrics_buf[256]; // Large enough for the header + PCIe bandwidth fields
+    ssize_t nread = pread(gpu_info->gpuMetricsFD, metrics_buf, sizeof(metrics_buf), 0);
+    if (nread >= 4) {
+      uint16_t structure_size;
+      memcpy(&structure_size, metrics_buf, sizeof(structure_size));
+      uint8_t format_revision = metrics_buf[2];
+      uint8_t content_revision = metrics_buf[3];
+
+      // gpu_metrics v1_4+ (dGPU) has pcie_bandwidth_acc and pcie_bandwidth_inst
+      // format_revision == 1 means dGPU metrics, content_revision >= 4 means v1_4+
+      if (format_revision == 1 && content_revision >= 4 && nread >= (ssize_t)structure_size) {
+        // In gpu_metrics_v1_4, the layout after the header has pcie_bandwidth_acc and pcie_bandwidth_inst
+        // as uint64_t fields. We use pcie_bandwidth_inst (instantaneous bandwidth in GB/sec)
+        // and split evenly as an approximation for RX/TX since the kernel doesn't separate them.
+        //
+        // Field offsets within gpu_metrics_v1_4, measured from the start of the buffer (4-byte header included):
+        //   The pcie_bandwidth_inst field follows pcie_bandwidth_acc.
+        //   The offsets are hard-coded below; they are obtained by summing the sizes of the preceding fields.
+        //
+        // Offset calculation for gpu_metrics_v1_4:
+        //   header(4) + temp_hotspot(2) + temp_mem(2) + temp_vrsoc(2) = 10
+        //   curr_socket_power(2) = 12
+        //   avg_gfx_activity(2) + avg_umc_activity(2) + vcn_activity[4](8) = 24
+        //   energy_accumulator(8) = 32
+        //   system_clock_counter(8) = 40
+        //   throttle_status(4) = 44
+        //   gfxclk_lock_status(4) = 48
+        //   pcie_link_width(2) + pcie_link_speed(2) = 52
+        //   xgmi_link_width(2) + xgmi_link_speed(2) = 56
+        //   gfx_activity_acc(4) + mem_activity_acc(4) = 64
+        //   pcie_bandwidth_acc(8) = offset 64, ends at 72
+        //   pcie_bandwidth_inst(8) = offset 72, ends at 80
+        // pcie_bandwidth_acc (offset 64) is not read here; only the instantaneous value is used.
+        const size_t pcie_bw_inst_offset = 72;
+        if (nread >= (ssize_t)(pcie_bw_inst_offset + sizeof(uint64_t))) {
+          uint64_t pcie_bw_inst;
+          memcpy(&pcie_bw_inst, metrics_buf + pcie_bw_inst_offset, sizeof(pcie_bw_inst));
+
+          // In gpu_metrics, if a sensor is unsupported, it often reports 0xFFFFFFFFFFFFFFFF (UINT64_MAX)
+          if (pcie_bw_inst != UINT64_MAX) {
+            // pcie_bandwidth_inst is in GB/sec, convert to KiB/sec
+            // Split evenly between RX and TX as a best approximation
+            uint64_t total_kib = pcie_bw_inst * 1024 * 1024; // GB/sec -> KiB/sec
+            SET_GPUINFO_DYNAMIC(dynamic_info, pcie_rx, total_kib / 2);
+            SET_GPUINFO_DYNAMIC(dynamic_info, pcie_tx, total_kib / 2);
+          }
+        }
+      }
+
+      if (nvtop_debug_amdgpu_metrics) {
+        fprintf(stderr, "[DEBUG] AMD gpu_metrics read %zd bytes: format_revision=%u, content_revision=%u\n", nread,
+                format_revision, content_revision);
+        fprintf(stderr, "[DEBUG] Raw gpu_metrics hex dump:\n");
+        for (ssize_t i = 0; i < nread; i++) {
+          fprintf(stderr, "%02x ", metrics_buf[i]);
+          if ((i + 1) % 16 == 0)
+            fprintf(stderr, "\n");
+        }
+        fprintf(stderr, "\n");
+      }
+    }
+  } else if (gpu_info->PCIeBW && nvtop_enable_pcie_bw_sleep) {
     uint64_t received, transmitted;
     int maxPayloadSize;
+    if (nvtop_debug_amdgpu_metrics) {
+      clock_gettime(CLOCK_MONOTONIC, &t_query_start);
+    }
     int NreadPatterns =
         rewindAndReadPattern(gpu_info->PCIeBW, "%" SCNu64 " %" SCNu64 " %i", &received, &transmitted, &maxPayloadSize);
+    if (nvtop_debug_amdgpu_metrics) {
+      clock_gettime(CLOCK_MONOTONIC, &t_query_end);
+      double elapsed_q = (t_query_end.tv_sec - t_query_start.tv_sec) * 1000.0 +
+                         (t_query_end.tv_nsec - t_query_start.tv_nsec) / 1000000.0;
+      fprintf(stderr, "[DEBUG] AMD pcie_bw inline read took %.2f ms. Matches: %d\n", elapsed_q, NreadPatterns);
+    }
     if (NreadPatterns == 3) {
       received *= maxPayloadSize;
       transmitted *= maxPayloadSize;
-      // Set in KiB
+      // Store in KiB
       received /= 1024;
       transmitted /= 1024;
+
       SET_GPUINFO_DYNAMIC(dynamic_info, pcie_rx, received);
       SET_GPUINFO_DYNAMIC(dynamic_info, pcie_tx, transmitted);
     }
@@ -787,6 +978,12 @@ static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
       SET_GPUINFO_DYNAMIC(dynamic_info, power_draw_max, powerCap / 1000);
     }
   }
+
+  // AMDGPU does not expose encode/decode utilization through DRM sensor queries.
+  // Set baseline to 0; actual per-process usage will be aggregated in
+  // gpuinfo_fix_dynamic_info_from_process_info.
+  SET_GPUINFO_DYNAMIC(dynamic_info, encoder_rate, 0);
+  SET_GPUINFO_DYNAMIC(dynamic_info, decoder_rate, 0);
 }
 
 static const char drm_amdgpu_pdev_old[] = "pdev";
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 25d22809..02cac222 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -29,6 +29,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 
 #define NVML_SUCCESS 0
 #define NVML_ERROR_NOT_SUPPORTED 3
@@ -170,8 +171,8 @@ typedef struct {
   unsigned long long usedGpuMemory;
   unsigned int gpuInstanceId;
   unsigned int computeInstanceId;
-  // This is present in https://github.com/NVIDIA/DCGM/blob/master/sdk/nvidia/nvml/nvml.h#L294 but not the latest driver nvml.h
-  // unsigned long long usedGpuCcProtectedMemory;
+  // This is present in https://github.com/NVIDIA/DCGM/blob/master/sdk/nvidia/nvml/nvml.h#L294 but not the latest driver
+  // nvml.h: unsigned long long usedGpuCcProtectedMemory;
 } nvmlProcessInfo_v3_t;
 
 static nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v1)(nvmlDevice_t device, unsigned int *infoCount,
@@ -394,11 +395,11 @@ static bool gpuinfo_nvidia_init(void) {
     goto init_error_clean_exit;
 
   nvmlDeviceGetGraphicsRunningProcesses[1] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v1;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v1;
   nvmlDeviceGetGraphicsRunningProcesses[2] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v2;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v2;
   nvmlDeviceGetGraphicsRunningProcesses[3] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v3;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v3;
 
   nvmlDeviceGetComputeRunningProcesses_v3 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetComputeRunningProcesses_v3");
   nvmlDeviceGetComputeRunningProcesses_v2 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetComputeRunningProcesses_v2");
@@ -408,11 +409,11 @@ static bool gpuinfo_nvidia_init(void) {
     goto init_error_clean_exit;
 
   nvmlDeviceGetComputeRunningProcesses[1] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v1;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v1;
   nvmlDeviceGetComputeRunningProcesses[2] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v2;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v2;
   nvmlDeviceGetComputeRunningProcesses[3] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v3;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v3;
 
   // These functions were not available in older NVML libs; don't error if not present
   nvmlDeviceGetMPSComputeRunningProcesses_v3 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMPSComputeRunningProcesses_v3");
@@ -420,11 +421,11 @@ static bool gpuinfo_nvidia_init(void) {
   nvmlDeviceGetMPSComputeRunningProcesses_v1 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMPSComputeRunningProcesses");
 
   nvmlDeviceGetMPSComputeRunningProcesses[1] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v1;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v1;
   nvmlDeviceGetMPSComputeRunningProcesses[2] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v2;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v2;
   nvmlDeviceGetMPSComputeRunningProcesses[3] =
-      (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v3;
+      (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v3;
 
   // These ones might not be available
   nvmlDeviceGetProcessUtilization = dlsym(libnvidia_ml_handle, "nvmlDeviceGetProcessUtilization");
@@ -882,30 +883,44 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
         }
       }
       memset(_gpu_info->processes, 0, _gpu_info->processes_count * sizeof(*_gpu_info->processes));
+      unsigned valid_procs = 0;
       for (unsigned i = 0; i < graphical_count + compute_count; ++i) {
-        if (i < graphical_count)
-          _gpu_info->processes[i].type = gpu_process_graphical;
-        else
-          _gpu_info->processes[i].type = gpu_process_compute;
+        pid_t parsed_pid = -1;
         switch (version) {
         case 2: {
           nvmlProcessInfo_v2_t *pinfo = (nvmlProcessInfo_v2_t *)retrieved_infos;
-          _gpu_info->processes[i].pid = pinfo[i].pid;
-          _gpu_info->processes[i].gpu_memory_usage = pinfo[i].usedGpuMemory;
+          parsed_pid = pinfo[i].pid;
+          _gpu_info->processes[valid_procs].pid = pinfo[i].pid;
+          _gpu_info->processes[valid_procs].gpu_memory_usage = pinfo[i].usedGpuMemory;
         } break;
         case 3: {
           nvmlProcessInfo_v3_t *pinfo = (nvmlProcessInfo_v3_t *)retrieved_infos;
-          _gpu_info->processes[i].pid = pinfo[i].pid;
-          _gpu_info->processes[i].gpu_memory_usage = pinfo[i].usedGpuMemory;
+          parsed_pid = pinfo[i].pid;
+          _gpu_info->processes[valid_procs].pid = pinfo[i].pid;
+          _gpu_info->processes[valid_procs].gpu_memory_usage = pinfo[i].usedGpuMemory;
         } break;
         default: {
           nvmlProcessInfo_v1_t *pinfo = (nvmlProcessInfo_v1_t *)retrieved_infos;
-          _gpu_info->processes[i].pid = pinfo[i].pid;
-          _gpu_info->processes[i].gpu_memory_usage = pinfo[i].usedGpuMemory;
+          parsed_pid = pinfo[i].pid;
+          _gpu_info->processes[valid_procs].pid = pinfo[i].pid;
+          _gpu_info->processes[valid_procs].gpu_memory_usage = pinfo[i].usedGpuMemory;
         } break;
         }
-        SET_VALID(gpuinfo_process_gpu_memory_usage_valid, _gpu_info->processes[i].valid);
+
+        // Do not display nvtop in its own interface.
+        if (parsed_pid == getpid()) {
+          continue;
+        }
+
+        if (i < graphical_count)
+          _gpu_info->processes[valid_procs].type = gpu_process_graphical;
+        else
+          _gpu_info->processes[valid_procs].type = gpu_process_compute;
+
+        SET_VALID(gpuinfo_process_gpu_memory_usage_valid, _gpu_info->processes[valid_procs].valid);
+        valid_procs++;
       }
+      _gpu_info->processes_count = valid_procs;
     }
   }
   // If the GPU is in MIG mode; process utilization is not supported
diff --git a/src/extract_processinfo_fdinfo.c b/src/extract_processinfo_fdinfo.c
index b8be7dc1..a2218a48 100644
--- a/src/extract_processinfo_fdinfo.c
+++ b/src/extract_processinfo_fdinfo.c
@@ -135,6 +135,10 @@ void processinfo_sweep_fdinfos(void) {
     if (!client_pid)
       goto next;
 
+    // Do not show nvtop itself in its internal processes list.
+    if (client_pid == (unsigned int)getpid())
+      goto next;
+
     fd_dir_fd = openat(pid_dir_fd, "fd", O_DIRECTORY);
     if (fd_dir_fd < 0)
       goto next;
@@ -277,12 +281,10 @@ void processinfo_sweep_fdinfos(void) {
                             process_info->dec_engine_used + processes_info_local.dec_engine_used);
       }
       if (GPUINFO_PROCESS_FIELD_VALID(&processes_info_local, gpu_cycles)) {
-        SET_GPUINFO_PROCESS(process_info, gpu_cycles,
-                            process_info->gpu_cycles + processes_info_local.gpu_cycles);
+        SET_GPUINFO_PROCESS(process_info, gpu_cycles, process_info->gpu_cycles + processes_info_local.gpu_cycles);
       }
       if (GPUINFO_PROCESS_FIELD_VALID(&processes_info_local, sample_delta)) {
-        SET_GPUINFO_PROCESS(process_info, sample_delta,
-                            process_info->sample_delta + processes_info_local.sample_delta);
+        SET_GPUINFO_PROCESS(process_info, sample_delta, process_info->sample_delta + processes_info_local.sample_delta);
       }
     }
 
diff --git a/src/interface.c b/src/interface.c
index 3aa80463..6b695735 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -19,9 +19,9 @@
  *
  */
 
+#include "nvtop/interface.h"
 #include "nvtop/common.h"
 #include "nvtop/extract_gpuinfo_common.h"
-#include "nvtop/interface.h"
 #include "nvtop/interface_common.h"
 #include "nvtop/interface_internal_common.h"
 #include "nvtop/interface_layout_selection.h"
@@ -43,7 +43,7 @@
 #include <unistd.h>
 
 static unsigned int sizeof_device_field[device_field_count] = {
-    [device_name] = 11,       [device_fan_speed] = 11,   [device_temperature] = 10, [device_power] = 15,
+    [device_name] = 11,       [device_fan_speed] = 26,   [device_temperature] = 10, [device_power] = 15,
     [device_clock] = 11,      [device_mem_clock] = 12,   [device_pcie] = 46,        [device_shadercores] = 7,
     [device_l2features] = 11, [device_execengines] = 11,
 };
@@ -761,8 +761,19 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
 
     // FAN
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_speed)) {
-      mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%%  ",
-                device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed);
+      if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_rpm) &&
+          GPUINFO_STATIC_FIELD_VALID(&device->static_info, fan_rpm_max)) {
+        mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%% [%u/%u RPM]",
+                  device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed,
+                  device->dynamic_info.fan_rpm, device->static_info.fan_rpm_max);
+      } else if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_rpm)) {
+        mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%% [%u RPM]",
+                  device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed,
+                  device->dynamic_info.fan_rpm);
+      } else {
+        mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%%  ",
+                  device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed);
+      }
       mvwchgat(dev->fan_speed, 0, 1, 3, 0, cyan_color, NULL);
     } else if (device->static_info.integrated_graphics) {
       mvwprintw(dev->fan_speed, 0, 0, "  CPU-FAN  ");
@@ -2094,6 +2105,18 @@ bool show_information_messages(unsigned num_messages, const char **messages) {
   return dontShowAgainOption;
 }
 
+// Write `bytes` into buf as a MiB value (1 MiB = 1024 * 1024 bytes) followed by " MiB".
+// buf/bufsz are handed to snprintf unchanged, so the text is cut off at bufsz when it
+// does not fit. Values of at least 100 MiB are printed with one decimal place, smaller
+// values with two.
+static void format_memory(char *buf, size_t bufsz, uint64_t bytes) {
+  const double mib = (double)bytes / (1024.0 * 1024.0);
+  // The precision is the only thing that varies, so pass it through "%.*f" rather
+  // than repeating the snprintf call in every branch.
+  const int decimals = mib >= 100.0 ? 1 : 2;
+  snprintf(buf, bufsz, "%.*f MiB", decimals, mib);
+}
+
 void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
   struct gpu_info *device;
 
@@ -2104,74 +2127,145 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
     const char *indent_level_six = "     ";
     const char *indent_level_eight = "       ";
 
-    const char *device_name_field = "device_name";
-    const char *gpu_clock_field = "gpu_clock";
-    const char *mem_clock_field = "mem_clock";
-    const char *temp_field = "temp";
-    const char *fan_field = "fan_speed";
-    const char *power_field = "power_draw";
-    const char *gpu_util_field = "gpu_util";
-    const char *mem_util_field = "mem_util";
-    const char *mem_total_field = "mem_total";
-    const char *mem_used_field = "mem_used";
-    const char *mem_free_field = "mem_free";
-
     printf("%s{\n", indent_level_two);
 
-    // Device Name
+    // ----- PCI address -----
+    printf("%s\"pci\": \"%s\",\n", indent_level_four, device->pdev);
+
+    // ----- Static info -----
+    printf("%s\"integrated_graphics\": %s,\n", indent_level_four,
+           device->static_info.integrated_graphics ? "true" : "false");
+    printf("%s\"encode_decode_shared\": %s,\n", indent_level_four,
+           device->static_info.encode_decode_shared ? "true" : "false");
+
+    // Slowdown threshold
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, temperature_slowdown_threshold)) {
+      unsigned int temp_raw = device->static_info.temperature_slowdown_threshold;
+      unsigned int temp_celsius = temp_raw / 1000; // convert to degrees
+      unsigned int temp_convert;
+      if (!use_fahrenheit_option)
+        temp_convert = temp_celsius;
+      else
+        temp_convert = (unsigned)(32 + nearbyint(temp_celsius * 1.8));
+      printf("%s\"temp_slowdown_threshold\": \"%u%s\",\n", indent_level_four, temp_convert,
+             use_fahrenheit_option ? "F" : "C");
+    } else {
+      printf("%s\"temp_slowdown_threshold\": null,\n", indent_level_four);
+    }
+
+    // Shutdown threshold
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, temperature_shutdown_threshold)) {
+      unsigned int temp_raw = device->static_info.temperature_shutdown_threshold;
+      unsigned int temp_celsius = temp_raw / 1000;
+      unsigned int temp_convert;
+      if (!use_fahrenheit_option)
+        temp_convert = temp_celsius;
+      else
+        temp_convert = (unsigned)(32 + nearbyint(temp_celsius * 1.8));
+      printf("%s\"temp_shutdown_threshold\": \"%u%s\",\n", indent_level_four, temp_convert,
+             use_fahrenheit_option ? "F" : "C");
+    } else {
+      printf("%s\"temp_shutdown_threshold\": null,\n", indent_level_four);
+    }
+
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, max_pcie_gen))
+      printf("%s\"max_pcie_gen\": %u,\n", indent_level_four, device->static_info.max_pcie_gen);
+    else
+      printf("%s\"max_pcie_gen\": null,\n", indent_level_four);
+
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, max_pcie_link_width))
+      printf("%s\"max_pcie_link_width\": %u,\n", indent_level_four, device->static_info.max_pcie_link_width);
+    else
+      printf("%s\"max_pcie_link_width\": null,\n", indent_level_four);
+
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, n_shared_cores))
+      printf("%s\"n_shared_cores\": %u,\n", indent_level_four, device->static_info.n_shared_cores);
+
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, l2cache_size))
+      printf("%s\"l2cache_size\": %u,\n", indent_level_four, device->static_info.l2cache_size);
+
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, n_exec_engines))
+      printf("%s\"n_exec_engines\": %u,\n", indent_level_four, device->static_info.n_exec_engines);
+
+    // ----- Device name (static) followed by dynamic info (current + max where available) -----
+    // Device name (original field)
     if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, device_name))
-      printf("%s\"%s\": \"%s\",\n", indent_level_four, device_name_field, device->static_info.device_name);
+      printf("%s\"device_name\": \"%s\",\n", indent_level_four, device->static_info.device_name);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, device_name_field);
+      printf("%s\"device_name\": null,\n", indent_level_four);
 
-    // GPU Clock Speed
+    // GPU clock (current and max)
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, gpu_clock_speed))
-      printf("%s\"%s\": \"%uMHz\",\n", indent_level_four, gpu_clock_field, device->dynamic_info.gpu_clock_speed);
+      printf("%s\"gpu_clock\": \"%uMHz\",\n", indent_level_four, device->dynamic_info.gpu_clock_speed);
+    else
+      printf("%s\"gpu_clock\": null,\n", indent_level_four);
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, gpu_clock_speed_max))
+      printf("%s\"gpu_clock_max\": \"%uMHz\",\n", indent_level_four, device->dynamic_info.gpu_clock_speed_max);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, gpu_clock_field);
+      printf("%s\"gpu_clock_max\": null,\n", indent_level_four);
 
-    // MEM Clock Speed
+    // MEM clock (current and max)
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, mem_clock_speed))
-      printf("%s\"%s\": \"%uMHz\",\n", indent_level_four, mem_clock_field, device->dynamic_info.mem_clock_speed);
+      printf("%s\"mem_clock\": \"%uMHz\",\n", indent_level_four, device->dynamic_info.mem_clock_speed);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, mem_clock_field);
+      printf("%s\"mem_clock\": null,\n", indent_level_four);
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, mem_clock_speed_max))
+      printf("%s\"mem_clock_max\": \"%uMHz\",\n", indent_level_four, device->dynamic_info.mem_clock_speed_max);
+    else
+      printf("%s\"mem_clock_max\": null,\n", indent_level_four);
 
-    // GPU Temperature
+    // GPU temperature (current)
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, gpu_temp)) {
       unsigned int temp_convert;
       if (!use_fahrenheit_option)
         temp_convert = device->dynamic_info.gpu_temp;
       else
         temp_convert = (unsigned)(32 + nearbyint(device->dynamic_info.gpu_temp * 1.8));
-
-      printf("%s\"%s\": \"%u%s\",\n", indent_level_four, temp_field, temp_convert, use_fahrenheit_option ? "F" : "C");
+      printf("%s\"temp\": \"%u%s\",\n", indent_level_four, temp_convert, use_fahrenheit_option ? "F" : "C");
     } else {
-      printf("%s\"%s\": null,\n", indent_level_four, temp_field);
+      printf("%s\"temp\": null,\n", indent_level_four);
     }
 
-    // Fan speed
-    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_speed))
-      printf("%s\"%s\": \"%u%%\",\n", indent_level_four, fan_field,
+    // Fan speed (percentage; "CPU Fan" placeholder on integrated graphics, otherwise null)
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_speed)) {
+      printf("%s\"fan_speed\": \"%u%%\",\n", indent_level_four,
              device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed);
-    else if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_rpm))
-      printf("%s\"%s\": \"%uRPM\",\n", indent_level_four, fan_field,
+    } else if (device->static_info.integrated_graphics) {
+      printf("%s\"fan_speed\": \"CPU Fan\",\n", indent_level_four);
+    } else {
+      printf("%s\"fan_speed\": null,\n", indent_level_four);
+    }
+
+    // Fan RPM (raw data)
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_rpm)) {
+      printf("%s\"fan_rpm\": \"%u\",\n", indent_level_four,
              device->dynamic_info.fan_rpm > 9999 ? 9999 : device->dynamic_info.fan_rpm);
-    else if (device->static_info.integrated_graphics)
-      printf("%s\"%s\": \"CPU Fan\",\n", indent_level_four, fan_field);
-    else
-      printf("%s\"%s\": null,\n", indent_level_four, fan_field);
+    } else {
+      printf("%s\"fan_rpm\": null,\n", indent_level_four);
+    }
 
-    // Power draw
+    // Fan RPM Max
+    if (GPUINFO_STATIC_FIELD_VALID(&device->static_info, fan_rpm_max)) {
+      printf("%s\"fan_rpm_max\": \"%u\",\n", indent_level_four, device->static_info.fan_rpm_max);
+    } else {
+      printf("%s\"fan_rpm_max\": null,\n", indent_level_four);
+    }
+
+    // Power draw (current and max)
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, power_draw))
-      printf("%s\"%s\": \"%uW\",\n", indent_level_four, power_field, device->dynamic_info.power_draw / 1000);
+      printf("%s\"power_draw\": \"%uW\",\n", indent_level_four, device->dynamic_info.power_draw / 1000);
+    else
+      printf("%s\"power_draw\": null,\n", indent_level_four);
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, power_draw_max))
+      printf("%s\"power_draw_max\": \"%uW\",\n", indent_level_four, device->dynamic_info.power_draw_max / 1000);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, power_field);
+      printf("%s\"power_draw_max\": null,\n", indent_level_four);
 
-    // GPU Utilization
+    // GPU utilization
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, gpu_util_rate))
-      printf("%s\"%s\": \"%u%%\",\n", indent_level_four, gpu_util_field, device->dynamic_info.gpu_util_rate);
+      printf("%s\"gpu_util\": \"%u%%\",\n", indent_level_four, device->dynamic_info.gpu_util_rate);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, gpu_util_field);
+      printf("%s\"gpu_util\": null,\n", indent_level_four);
 
     // Encode / Decode
     if (device->static_info.encode_decode_shared) {
@@ -2193,28 +2287,58 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
         printf("null,\n");
     }
 
-    // Memory Utilization
+    // Memory utilization
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, mem_util_rate))
-      printf("%s\"%s\": \"%u%%\",\n", indent_level_four, mem_util_field, device->dynamic_info.mem_util_rate);
+      printf("%s\"mem_util\": \"%u%%\",\n", indent_level_four, device->dynamic_info.mem_util_rate);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, mem_util_field);
-    // Memory Total
-    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, total_memory))
-      printf("%s\"%s\": \"%llu\",\n", indent_level_four, mem_total_field, device->dynamic_info.total_memory);
+      printf("%s\"mem_util\": null,\n", indent_level_four);
+
+    // Memory total / used / free (human-readable)
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, total_memory)) {
+      char mem_buf[32];
+      format_memory(mem_buf, sizeof(mem_buf), device->dynamic_info.total_memory);
+      printf("%s\"mem_total\": \"%s\",\n", indent_level_four, mem_buf);
+    } else {
+      printf("%s\"mem_total\": null,\n", indent_level_four);
+    }
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, used_memory)) {
+      char mem_buf[32];
+      format_memory(mem_buf, sizeof(mem_buf), device->dynamic_info.used_memory);
+      printf("%s\"mem_used\": \"%s\",\n", indent_level_four, mem_buf);
+    } else {
+      printf("%s\"mem_used\": null,\n", indent_level_four);
+    }
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, free_memory)) {
+      char mem_buf[32];
+      format_memory(mem_buf, sizeof(mem_buf), device->dynamic_info.free_memory);
+      printf("%s\"mem_free\": \"%s\",\n", indent_level_four, mem_buf);
+    } else {
+      printf("%s\"mem_free\": null,\n", indent_level_four);
+    }
+
+    // PCIe link status (current)
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_link_gen))
+      printf("%s\"pcie_link_gen\": %u,\n", indent_level_four, device->dynamic_info.pcie_link_gen);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, mem_total_field);
-    // Memory Used
-    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, used_memory))
-      printf("%s\"%s\": \"%llu\",\n", indent_level_four, mem_used_field, device->dynamic_info.used_memory);
+      printf("%s\"pcie_link_gen\": null,\n", indent_level_four);
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_link_width))
+      printf("%s\"pcie_link_width\": %u,\n", indent_level_four, device->dynamic_info.pcie_link_width);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, mem_used_field);
-    // Memory Available
-    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, free_memory))
-      printf("%s\"%s\": \"%llu\",\n", indent_level_four, mem_free_field, device->dynamic_info.free_memory);
+      printf("%s\"pcie_link_width\": null,\n", indent_level_four);
+
+    // PCIe bandwidth (KiB/s)
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_rx))
+      printf("%s\"pcie_rx\": \"%uKiB/s\",\n", indent_level_four, device->dynamic_info.pcie_rx);
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_tx))
+      printf("%s\"pcie_tx\": \"%uKiB/s\",\n", indent_level_four, device->dynamic_info.pcie_tx);
+
+    // Effective load rate
+    if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, effective_load_rate))
+      printf("%s\"effective_load_rate\": \"%u%%\",\n", indent_level_four, device->dynamic_info.effective_load_rate);
     else
-      printf("%s\"%s\": null,\n", indent_level_four, mem_free_field);
+      printf("%s\"effective_load_rate\": null,\n", indent_level_four);
 
-    // Processes
+    // ----- Processes -----
     printf("%s\"processes\" : [\n", indent_level_four);
     for (unsigned i = 0; i < device->processes_count; ++i) {
       struct gpu_process *proc = &device->processes[i];
@@ -2223,9 +2347,9 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
       // PID
       printf("%s\"pid\": \"%d\",\n", indent_level_eight, proc->pid);
 
+      // Command line (escaped)
       printf("%s\"cmdline\": \"", indent_level_eight);
       for (char *li = proc->cmdline; *li != '\0'; li++) {
-        // We need to escape some characters for for json strings
         if (*li == '\n') {
           printf("\\n");
           continue;
@@ -2242,13 +2366,13 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
           printf("\\t");
           continue;
         }
-        // escaping backslash and quotes
         if (*li == '\\' || *li == '"')
           printf("\\");
         printf("%c", *li);
       }
       printf("\",\n");
 
+      // Process type
       printf("%s\"kind\": ", indent_level_eight);
       if (proc->type != gpu_process_unknown) {
         printf("\"");
@@ -2272,7 +2396,7 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
       }
       printf(",\n");
 
-      // GPU memory usage
+      // User
       printf("%s\"user\": ", indent_level_eight);
       if (GPUINFO_PROCESS_FIELD_VALID(proc, user_name))
         printf("\"%s\",\n", proc->user_name);
@@ -2286,23 +2410,25 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
       else
         printf("null,\n");
 
-      // GPU memory usage
+      // GPU memory allocated by the process (formatted in MiB)
       printf("%s\"gpu_mem_bytes_alloc\": ", indent_level_eight);
-      if (GPUINFO_PROCESS_FIELD_VALID(proc, gpu_memory_usage))
-        printf("\"%llu\",\n", proc->gpu_memory_usage);
-      else
+      if (GPUINFO_PROCESS_FIELD_VALID(proc, gpu_memory_usage)) {
+        char mem_buf[32];
+        format_memory(mem_buf, sizeof(mem_buf), proc->gpu_memory_usage);
+        printf("\"%s\",\n", mem_buf);
+      } else {
         printf("null,\n");
+      }
 
-      // GPU memory usage
+      // GPU memory percentage
       printf("%s\"gpu_mem_usage\": ", indent_level_eight);
       if (GPUINFO_PROCESS_FIELD_VALID(proc, gpu_memory_percentage))
         printf("\"%u%%\",\n", proc->gpu_memory_percentage);
       else
         printf("null,\n");
 
-      // Encode usage
+      // Encode / decode
       if (device->static_info.encode_decode_shared) {
-        // (Notice: no comma at the end as it's the last field here)
         printf("%s\"encode_decode\": ", indent_level_eight);
         if (GPUINFO_PROCESS_FIELD_VALID(proc, decode_usage))
           printf("\"%u%%\"\n", proc->decode_usage);
@@ -2314,7 +2440,6 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
           printf("\"%u%%\",\n", proc->encode_usage);
         else
           printf("null,\n");
-        // (Notice: no comma at the end as it's the last field here)
         printf("%s\"decode\": ", indent_level_eight);
         if (GPUINFO_PROCESS_FIELD_VALID(proc, decode_usage))
           printf("\"%u%%\"\n", proc->decode_usage);
@@ -2327,9 +2452,9 @@ void print_snapshot(struct list_head *devices, bool use_fahrenheit_option) {
         printf(",");
       printf("\n");
     }
-    // (Notice: no comma at the end as it's the last field here)
     printf("%s]\n", indent_level_four);
 
+    // Close device object
     if (device->list.next == devices)
       printf("%s}\n", indent_level_two);
     else
diff --git a/src/nvtop.c b/src/nvtop.c
index 67954448..d32b96c6 100644
--- a/src/nvtop.c
+++ b/src/nvtop.c
@@ -28,6 +28,7 @@
 #include "nvtop/version.h"
 
 #include <getopt.h>
+#include <math.h>
 #include <ncurses.h>
 #include <signal.h>
 #include <stdbool.h>
@@ -58,6 +59,9 @@ static void cont_handler(int signum) {
   signal_cont_received = 1;
 }
 
+bool nvtop_debug_amdgpu_metrics = false;
+bool nvtop_enable_pcie_bw_sleep = false;
+
 static const char helpstring[] = "Available options:\n"
                                  "  -d --delay        : Select the refresh rate (1 == 0.1s)\n"
                                  "  -v --version      : Print the version and exit\n"
@@ -75,8 +79,9 @@ static const char helpstring[] = "Available options:\n"
                                  "(default 30s, negative = always on screen)\n"
                                  "  -h --help         : Print help and exit\n"
                                  "  -s --snapshot     : Output the current gpu stats without ncurses"
-                                 "(useful for scripting)\n"
-                                 "  -l --loop         : Output the current gpu stats without ncurses in a loop\n";
+                                 "\n  -l --loop         : Output the current gpu stats without ncurses in a loop\n"
+                                 "  -S --pciespeed    : Forces 1-second delay for PCIe bandwidth fallback (AMD only)\n"
+                                 "  -D --debug        : Output raw gpu_metrics data to stderr (AMD only)\n";
 
 static const char versionString[] = "nvtop version " NVTOP_VERSION_STRING;
 
@@ -95,10 +100,12 @@ static const struct option long_opts[] = {
     {.name = "reverse-abs", .has_arg = no_argument, .flag = NULL, .val = 'r'},
     {.name = "snapshot", .has_arg = no_argument, .flag = NULL, .val = 's'},
     {.name = "loop", .has_arg = no_argument, .flag = NULL, .val = 'l'},
+    {.name = "pciespeed", .has_arg = no_argument, .flag = NULL, .val = 'S'},
+    {.name = "debug", .has_arg = no_argument, .flag = NULL, .val = 'D'},
     {0, 0, 0, 0},
 };
 
-static const char opts[] = "hvd:c:CfE:pPrisl";
+static const char opts[] = "hvd:c:CfE:pPrislSD";
 
 int main(int argc, char **argv) {
   (void)setlocale(LC_CTYPE, "");
@@ -181,6 +188,12 @@ int main(int argc, char **argv) {
     case 'l':
       loop_snapshot = true;
       break;
+    case 'D':
+      nvtop_debug_amdgpu_metrics = true;
+      break;
+    case 'S':
+      nvtop_enable_pcie_bw_sleep = true;
+      break;
     case ':':
     case '?':
       switch (optopt) {
@@ -233,27 +246,53 @@ int main(int argc, char **argv) {
     return EXIT_SUCCESS;
   }
 
+  gpuinfo_populate_static_infos(&monitoredGpus);
+
+  // Pre-warm the cycle-based metrics by taking an initial reading here.
+  // This allows the ensuing setup time (e.g. sysfs parsing, curses init) to
+  // count towards the 100ms time delta needed to calculate load percentages
+  // before the first frame is drawn.
+  gpuinfo_refresh_dynamic_info(&monitoredGpus);
+  gpuinfo_refresh_processes(&monitoredGpus);
+  gpuinfo_utilisation_rate(&monitoredGpus);
+
+  nvtop_time time_startup_refresh;
+  nvtop_get_current_time(&time_startup_refresh);
+
   if (show_snapshot || loop_snapshot) {
-    gpuinfo_populate_static_infos(&monitoredGpus);
-
-    // Always do a refresh followed by a short sleep to have valid cycle based
-    // metrics
-    gpuinfo_refresh_dynamic_info(&monitoredGpus);
-    gpuinfo_refresh_processes(&monitoredGpus);
-    gpuinfo_utilisation_rate(&monitoredGpus);
-    // Default to 0.1 sec
+    // Default to 0.1 sec if not given
     if (!update_interval_option_set)
       update_interval_option = 100;
 
+    bool first_snapshot = true;
+
     do {
+      if (first_snapshot) {
+        nvtop_time time_before_snap;
+        nvtop_get_current_time(&time_before_snap);
+        double startup_elapsed_ms = nvtop_difftime(time_startup_refresh, time_before_snap) * 1000.0;
+
+        if (startup_elapsed_ms < update_interval_option) {
+          double remaining_ms = update_interval_option - startup_elapsed_ms;
+#if _POSIX_C_SOURCE >= 199309L
+          struct timespec tv = {.tv_sec = (long)(remaining_ms / 1000.0),
+                                .tv_nsec = (long)(fmod(remaining_ms, 1000.0) * 1000000.0)};
+          nanosleep(&tv, &tv);
+#else
+          usleep((useconds_t)(remaining_ms * 1000.0));
+#endif
+        }
+        first_snapshot = false;
+      } else {
 #if _POSIX_C_SOURCE >= 199309L
-      struct timespec tv = {.tv_sec = update_interval_option / 1000,
-                            .tv_nsec = (update_interval_option % 1000) * 1000000};
-      nanosleep(&tv, &tv);
+        struct timespec tv = {.tv_sec = update_interval_option / 1000,
+                              .tv_nsec = (update_interval_option % 1000) * 1000000};
+        nanosleep(&tv, &tv);
 #else
-      int sec = update_interval_option / 1000;
-      sleep(sec > 0 ? sec : 1);
+        int sec = update_interval_option / 1000;
+        sleep(sec > 0 ? sec : 1);
 #endif
+      }
       gpuinfo_refresh_dynamic_info(&monitoredGpus);
       gpuinfo_refresh_processes(&monitoredGpus);
       gpuinfo_utilisation_rate(&monitoredGpus);
@@ -308,7 +347,6 @@ int main(int argc, char **argv) {
     allDevicesOptions.update_interval = update_interval_option;
   allDevicesOptions.has_gpu_info_bar = allDevicesOptions.has_gpu_info_bar || show_gpu_info_bar;
 
-  gpuinfo_populate_static_infos(&monitoredGpus);
   unsigned numMonitoredGpus =
       interface_check_and_fix_monitored_gpus(allDevCount, &monitoredGpus, &nonMonitoredGpus, &allDevicesOptions);
 
@@ -320,6 +358,22 @@ int main(int argc, char **argv) {
     }
   }
 
+  // Ensure at least 100ms has elapsed since the pre-warm metrics were taken
+  // to guarantee a valid time delta for load percent calculations.
+  nvtop_time time_before_ui;
+  nvtop_get_current_time(&time_before_ui);
+  double startup_elapsed_ms = nvtop_difftime(time_startup_refresh, time_before_ui) * 1000.0;
+
+  if (startup_elapsed_ms < 100.0) {
+    double remaining_ms = 100.0 - startup_elapsed_ms;
+#if _POSIX_C_SOURCE >= 199309L
+    struct timespec tv = {.tv_sec = 0, .tv_nsec = (long)(remaining_ms * 1000000.0)};
+    nanosleep(&tv, &tv);
+#else
+    usleep((useconds_t)(remaining_ms * 1000.0));
+#endif
+  }
+
   struct nvtop_interface *interface =
       initialize_curses(allDevCount, numMonitoredGpus, interface_largest_gpu_name(&monitoredGpus), allDevicesOptions);
   timeout(interface_update_interval(interface));