From 332b9f99ddbe3d8807a250ce83e16b6261e52ff1 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:24:17 -0700 Subject: [PATCH 01/17] Revert "NVIDIA: VR: SAUCE: perf vendor events arm64: Add Tegra410 Olympus PMU events" This reverts commit cf682dc04b5c460266809f96c9122abd728cbfe4. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. Ochs --- .../arch/arm64/common-and-microarch.json | 90 --- tools/perf/pmu-events/arch/arm64/mapfile.csv | 1 - .../arch/arm64/nvidia/t410/branch.json | 45 -- .../arch/arm64/nvidia/t410/brbe.json | 6 - .../arch/arm64/nvidia/t410/bus.json | 48 -- .../arch/arm64/nvidia/t410/exception.json | 62 -- .../arch/arm64/nvidia/t410/fp_operation.json | 78 -- .../arch/arm64/nvidia/t410/general.json | 15 - .../arch/arm64/nvidia/t410/l1d_cache.json | 122 --- .../arch/arm64/nvidia/t410/l1i_cache.json | 114 --- .../arch/arm64/nvidia/t410/l2d_cache.json | 134 ---- .../arch/arm64/nvidia/t410/ll_cache.json | 107 --- .../arch/arm64/nvidia/t410/memory.json | 46 -- .../arch/arm64/nvidia/t410/metrics.json | 722 ------------------ .../arch/arm64/nvidia/t410/misc.json | 646 ---------------- .../arch/arm64/nvidia/t410/retired.json | 94 --- .../arch/arm64/nvidia/t410/spe.json | 42 - .../arm64/nvidia/t410/spec_operation.json | 230 ------ .../arch/arm64/nvidia/t410/stall.json | 145 ---- .../arch/arm64/nvidia/t410/tlb.json | 158 ---- 20 files changed, 2905 deletions(-) delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json delete mode 100644 
tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json delete mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json diff --git a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json index 6af15776ff170..468cb085d8796 100644 --- a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json +++ b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json @@ -179,11 +179,6 @@ "EventName": "BUS_CYCLES", "BriefDescription": "Bus cycle" }, - { - "EventCode": "0x001E", - "EventName": "CHAIN", - "BriefDescription": "Chain a pair of event counters." - }, { "PublicDescription": "Level 1 data cache allocation without refill", "EventCode": "0x1F", @@ -1517,26 +1512,11 @@ "EventName": "L2D_CACHE_REFILL_PRFM", "BriefDescription": "Level 2 data cache refill, software preload" }, - { - "EventCode": "0x8150", - "EventName": "L3D_CACHE_RW", - "BriefDescription": "Level 3 data cache demand access." 
- }, - { - "EventCode": "0x8151", - "EventName": "L3D_CACHE_PRFM", - "BriefDescription": "Level 3 data cache software prefetch" - }, { "EventCode": "0x8152", "EventName": "L3D_CACHE_MISS", "BriefDescription": "Level 3 data cache demand access miss" }, - { - "EventCode": "0x8153", - "EventName": "L3D_CACHE_REFILL_PRFM", - "BriefDescription": "Level 3 data cache refill, software prefetch." - }, { "EventCode": "0x8154", "EventName": "L1D_CACHE_HWPRF", @@ -1547,11 +1527,6 @@ "EventName": "L2D_CACHE_HWPRF", "BriefDescription": "Level 2 data cache hardware prefetch." }, - { - "EventCode": "0x8156", - "EventName": "L3D_CACHE_HWPRF", - "BriefDescription": "Level 3 data cache hardware prefetch." - }, { "EventCode": "0x8158", "EventName": "STALL_FRONTEND_MEMBOUND", @@ -1707,11 +1682,6 @@ "EventName": "L2D_CACHE_REFILL_HWPRF", "BriefDescription": "Level 2 data cache refill, hardware prefetch." }, - { - "EventCode": "0x81BE", - "EventName": "L3D_CACHE_REFILL_HWPRF", - "BriefDescription": "Level 3 data cache refill, hardware prefetch." - }, { "EventCode": "0x81C0", "EventName": "L1I_CACHE_HIT_RD", @@ -1742,31 +1712,11 @@ "EventName": "L1I_CACHE_HIT_RD_FPRFM", "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by software preload" }, - { - "EventCode": "0x81DC", - "EventName": "L1D_CACHE_HIT_RW_FPRFM", - "BriefDescription": "Level 1 data cache demand access first hit, fetched by software prefetch." - }, { "EventCode": "0x81E0", "EventName": "L1I_CACHE_HIT_RD_FHWPRF", "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by hardware prefetcher" }, - { - "EventCode": "0x81EC", - "EventName": "L1D_CACHE_HIT_RW_FHWPRF", - "BriefDescription": "Level 1 data cache demand access first hit, fetched by hardware prefetcher." - }, - { - "EventCode": "0x81F0", - "EventName": "L1I_CACHE_HIT_RD_FPRF", - "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by prefetch." 
- }, - { - "EventCode": "0x81FC", - "EventName": "L1D_CACHE_HIT_RW_FPRF", - "BriefDescription": "Level 1 data cache demand access first hit, fetched by prefetch." - }, { "EventCode": "0x8200", "EventName": "L1I_CACHE_HIT", @@ -1817,26 +1767,11 @@ "EventName": "L1I_LFB_HIT_RD_FPRFM", "BriefDescription": "Level 1 instruction cache demand fetch line-fill buffer first hit, recently fetched by software preload" }, - { - "EventCode": "0x825C", - "EventName": "L1D_LFB_HIT_RW_FPRFM", - "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by software prefetch." - }, { "EventCode": "0x8260", "EventName": "L1I_LFB_HIT_RD_FHWPRF", "BriefDescription": "Level 1 instruction cache demand fetch line-fill buffer first hit, recently fetched by hardware prefetcher" }, - { - "EventCode": "0x826C", - "EventName": "L1D_LFB_HIT_RW_FHWPRF", - "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by hardware prefetcher." - }, - { - "EventCode": "0x827C", - "EventName": "L1D_LFB_HIT_RW_FPRF", - "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by prefetch." - }, { "EventCode": "0x8280", "EventName": "L1I_CACHE_PRF", @@ -1872,11 +1807,6 @@ "EventName": "LL_CACHE_REFILL", "BriefDescription": "Last level cache refill" }, - { - "EventCode": "0x828E", - "EventName": "L3D_CACHE_REFILL_PRF", - "BriefDescription": "Level 3 data cache refill, prefetch." - }, { "EventCode": "0x8320", "EventName": "L1D_CACHE_REFILL_PERCYC", @@ -1942,16 +1872,6 @@ "EventName": "FP_FP8_MIN_SPEC", "BriefDescription": "Floating-point operation speculatively_executed, smallest type is 8-bit floating-point." }, - { - "EventCode": "0x8480", - "EventName": "FP_SP_FIXED_MIN_OPS_SPEC", - "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is single-precision floating-point." 
- }, - { - "EventCode": "0x8482", - "EventName": "FP_HP_FIXED_MIN_OPS_SPEC", - "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is half-precision floating-point." - }, { "EventCode": "0x8483", "EventName": "FP_BF16_FIXED_MIN_OPS_SPEC", @@ -1962,16 +1882,6 @@ "EventName": "FP_FP8_FIXED_MIN_OPS_SPEC", "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is 8-bit floating-point." }, - { - "EventCode": "0x8488", - "EventName": "FP_SP_SCALE_MIN_OPS_SPEC", - "BriefDescription": "Scalable element arithmetic operations speculatively executed, smallest type is single-precision floating-point." - }, - { - "EventCode": "0x848A", - "EventName": "FP_HP_SCALE_MIN_OPS_SPEC", - "BriefDescription": "Scalable element arithmetic operations speculatively executed, smallest type is half-precision floating-point." - }, { "EventCode": "0x848B", "EventName": "FP_BF16_SCALE_MIN_OPS_SPEC", diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index 7f0eaa7020485..bb3fa8a33496a 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -46,4 +46,3 @@ 0x00000000500f0000,v1,ampere/emag,core 0x00000000c00fac30,v1,ampere/ampereone,core 0x00000000c00fac40,v1,ampere/ampereonex,core -0x000000004e0f0100,v1,nvidia/t410,core diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json deleted file mode 100644 index 532bc59dc573e..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json +++ /dev/null @@ -1,45 +0,0 @@ -[ - { - "ArchStdEvent": "BR_MIS_PRED", - "PublicDescription": "The Event counts Branches which are speculatively executed and mis-predicted." - }, - { - "ArchStdEvent": "BR_PRED", - "PublicDescription": "The Event counts all speculatively executed Branches." 
- }, - { - "EventCode": "0x017e", - "EventName": "BR_PRED_BTB_CTX_UPDATE", - "PublicDescription": "Branch context table update." - }, - { - "EventCode": "0x0188", - "EventName": "BR_MIS_PRED_DIR_RESOLVED", - "PublicDescription": "Number of Branch misprediction due to direction misprediction." - }, - { - "EventCode": "0x0189", - "EventName": "BR_MIS_PRED_DIR_UNCOND_RESOLVED", - "PublicDescription": "Number of Branch misprediction due to direction misprediction for unconditional Branches." - }, - { - "EventCode": "0x018a", - "EventName": "BR_MIS_PRED_DIR_UNCOND_DIRECT_RESOLVED", - "PublicDescription": "Number of Branch misprediction due to direction misprediction for unconditional direct Branches." - }, - { - "EventCode": "0x018b", - "EventName": "BR_PRED_MULTI_RESOLVED", - "PublicDescription": "Number of resolved branch which made prediction by polymorphic indirect predictor." - }, - { - "EventCode": "0x018c", - "EventName": "BR_MIS_PRED_MULTI_RESOLVED", - "PublicDescription": "Number of branch misprediction which made prediction by polymorphic indirect predictor." - }, - { - "EventCode": "0x01e4", - "EventName": "BR_RGN_RECLAIM", - "PublicDescription": "The Event counts the Indirect predictor entries flushed by region reclamation." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json deleted file mode 100644 index 7c43a01c9707d..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json +++ /dev/null @@ -1,6 +0,0 @@ -[ - { - "ArchStdEvent": "BRB_FILTRATE", - "PublicDescription": "The Event counts each valid Branch record captured in the Branch record buffer. Branch records that are not captured because they are removed by filtering are not counted." 
- } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json deleted file mode 100644 index c4cee0be1242e..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json +++ /dev/null @@ -1,48 +0,0 @@ -[ - { - "ArchStdEvent": "BUS_ACCESS", - "PublicDescription": "The Event counts the number of Data-beat access between the CPU and the external bus. This count includes access due to Read, Write, and Snoop. Each beat of Data is counted individually." - }, - { - "ArchStdEvent": "BUS_CYCLES", - "PublicDescription": "The Event counts bus cycles in the CPU. Bus cycles represent a clock cycle in which a transaction could be sent or received on the interface from the CPU to the external bus. Since that interface is driven at the same clock speed as the CPU, this Event increments at the rate of CPU clock. Regardless of the WFE/WFI state of the PE, this Event increment on each processor clock." - }, - { - "ArchStdEvent": "BUS_ACCESS_RD", - "PublicDescription": "The Event counts memory Read transactions seen on the external bus. Each beat of Data is counted individually." - }, - { - "ArchStdEvent": "BUS_ACCESS_WR", - "PublicDescription": "The Event counts memory Write transactions seen on the external bus. Each beat of Data is counted individually." - }, - { - "EventCode": "0x0154", - "EventName": "BUS_REQUEST_REQ", - "PublicDescription": "Bus request, request." - }, - { - "EventCode": "0x0155", - "EventName": "BUS_REQUEST_RETRY", - "PublicDescription": "Bus request, retry." - }, - { - "EventCode": "0x0198", - "EventName": "L2_CHI_CBUSY0", - "PublicDescription": "Number of RXDAT or RXRSP response received width CBusy of 0." - }, - { - "EventCode": "0x0199", - "EventName": "L2_CHI_CBUSY1", - "PublicDescription": "Number of RXDAT or RXRSP response received width CBusy of 1." 
- }, - { - "EventCode": "0x019a", - "EventName": "L2_CHI_CBUSY2", - "PublicDescription": "Number of RXDAT or RXRSP response received width CBusy of 2." - }, - { - "EventCode": "0x019b", - "EventName": "L2_CHI_CBUSY3", - "PublicDescription": "Number of RXDAT or RXRSP response received width CBusy of 3." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json deleted file mode 100644 index 2f31fb2e67a25..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json +++ /dev/null @@ -1,62 +0,0 @@ -[ - { - "ArchStdEvent": "EXC_TAKEN", - "PublicDescription": "The Event counts any taken architecturally visible exceptions such as IRQ, FIQ, SError, and other synchronous exceptions. Exceptions are counted whether or not they are taken locally." - }, - { - "ArchStdEvent": "EXC_RETURN", - "PublicDescription": "The Event counts any architecturally executed exception return Instructions. For example: AArch64: ERET." - }, - { - "ArchStdEvent": "EXC_UNDEF", - "PublicDescription": "The Event counts the number of synchronous exceptions which are taken locally that are due to attempting to execute an Instruction that is UNDEFINED. Attempting to execute Instruction bit patterns that have not been allocated. Attempting to execute Instructions when they are disabled. Attempting to execute Instructions at an inappropriate Exception level. Attempting to execute an Instruction when the value of PSTATE.IL is 1." - }, - { - "ArchStdEvent": "EXC_SVC", - "PublicDescription": "The Event counts SVC exceptions taken locally." - }, - { - "ArchStdEvent": "EXC_PABORT", - "PublicDescription": "The Event counts synchronous exceptions that are taken locally and caused by Instruction Aborts." - }, - { - "ArchStdEvent": "EXC_DABORT", - "PublicDescription": "The Event counts exceptions that are taken locally and are caused by Data aborts or SErrors. 
Conditions that could cause those exceptions are attempting to read or write memory where the MMU generates a fault, attempting to read or write memory with a misaligned address, Interrupts from the nSEI inputs and internally generated SErrors." - }, - { - "ArchStdEvent": "EXC_IRQ", - "PublicDescription": "The Event counts IRQ exceptions including the virtual IRQs that are taken locally." - }, - { - "ArchStdEvent": "EXC_FIQ", - "PublicDescription": "The Event counts FIQ exceptions including the virtual FIQs that are taken locally." - }, - { - "ArchStdEvent": "EXC_SMC", - "PublicDescription": "The Event counts SMC exceptions take to EL3." - }, - { - "ArchStdEvent": "EXC_HVC", - "PublicDescription": "The Event counts HVC exceptions taken to EL2." - }, - { - "ArchStdEvent": "EXC_TRAP_PABORT", - "PublicDescription": "The Event counts exceptions which are traps not taken locally and are caused by Instruction Aborts. For example, attempting to execute an Instruction with a misaligned PC." - }, - { - "ArchStdEvent": "EXC_TRAP_DABORT", - "PublicDescription": "The Event counts exceptions which are traps not taken locally and are caused by Data Aborts or SError Interrupts. Conditions that could cause those exceptions are: * Attempting to read or write memory where the MMU generates a fault, * Attempting to read or write memory with a misaligned address, * Interrupts from the SEI input. * internally generated SErrors." - }, - { - "ArchStdEvent": "EXC_TRAP_OTHER", - "PublicDescription": "The Event counts the number of synchronous trap exceptions which are not taken locally and are not SVC, SMC, HVC, Data aborts, Instruction Aborts, or Interrupts." - }, - { - "ArchStdEvent": "EXC_TRAP_IRQ", - "PublicDescription": "The Event counts IRQ exceptions including the virtual IRQs that are not taken locally." 
- }, - { - "ArchStdEvent": "EXC_TRAP_FIQ", - "PublicDescription": "The Event counts FIQs which are not taken locally but taken from EL0, EL1, or EL2 to EL3 (which would be the normal behavior for FIQs when not executing in EL3)." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json deleted file mode 100644 index 0b6d047207518..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json +++ /dev/null @@ -1,78 +0,0 @@ -[ - { - "ArchStdEvent": "FP_HP_SPEC", - "PublicDescription": "The Event counts speculatively executed half precision floating point operations." - }, - { - "ArchStdEvent": "FP_SP_SPEC", - "PublicDescription": "The Event counts speculatively executed single precision floating point operations." - }, - { - "ArchStdEvent": "FP_DP_SPEC", - "PublicDescription": "The Event counts speculatively executed double precision floating point operations." - }, - { - "ArchStdEvent": "FP_SCALE_OPS_SPEC", - "PublicDescription": "The Event counts speculatively executed scalable single precision floating point operations." - }, - { - "ArchStdEvent": "FP_FIXED_OPS_SPEC", - "PublicDescription": "The Event counts speculatively executed non-scalable single precision floating point operations." - }, - { - "ArchStdEvent": "FP_HP_SCALE_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed scalable element arithmetic operation, due to an Instruction where the largest type was half-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or Instruction which causes the counter to increment. The Event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." 
- }, - { - "ArchStdEvent": "FP_HP_FIXED_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an Instruction where the largest type was half-precision floating-point, where v is the number of arithmetic operations carried out by the operation or Instruction which causes The even to increment. The Event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_SP_SCALE_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed scalable element arithmetic operation, due to an Instruction where the largest type was single-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_SP_FIXED_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an Instruction where the largest type was single-precision floating-point, where v is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_DP_SCALE_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed scalable element arithmetic operation, due to an Instruction where the largest type was double-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." 
- }, - { - "ArchStdEvent": "FP_DP_FIXED_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an Instruction where the largest type was double-precision floating-point, where v is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_BF16_FIXED_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an Instruction where the smallest type was BFloat16 floating-point. Where v is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_BF16_SCALE_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed scalable element arithmetic operation, due to an Instruction where the smallest type was BFloat16 floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_FP8_FIXED_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an Instruction where the smallest type was 8-bit floating-point, where v is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." 
- }, - { - "ArchStdEvent": "FP_FP8_SCALE_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed scalable element arithmetic operation, due to an Instruction where the smallest type was 8-bit floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_HP_FIXED_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an Instruction where the smallest type was half-precision floating-point, where v is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_HP_SCALE_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed scalable element arithmetic operation, due to an Instruction where the smallest type was half-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_SP_FIXED_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an Instruction where the smallest type was single-precision floating-point, where v is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. 
The Event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." - }, - { - "ArchStdEvent": "FP_SP_SCALE_MIN_OPS_SPEC", - "PublicDescription": "The Event increments by v for each speculatively executed scalable element arithmetic operation, due to an Instruction where the smallest type was single-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or Instruction which causes The Event to increment. The Event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json deleted file mode 100644 index bbeb4a75d2618..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json +++ /dev/null @@ -1,15 +0,0 @@ -[ - { - "ArchStdEvent": "CPU_CYCLES", - "PublicDescription": "The Event counts CPU clock cycles, when the PE is not in WFE/WFI. The clock measured by this Event is defined as the physical clock driving the CPU logic." - }, - { - "ArchStdEvent": "CNT_CYCLES", - "PublicDescription": "The Event increments at a constant frequency equal to the rate of increment of the System Counter, CNTPCT_EL0. This Event does not increment when the PE is in WFE/WFI." - }, - { - "EventCode": "0x01e1", - "EventName": "CPU_SLOT", - "PublicDescription": "Entitled CPU slots. The Event counts the number of slots. When in ST mode, this Event shall increment by PMMIR_EL1.SLOTS quantities, and when in SMT partitioned resource mode (regardless of in WFI state or otherwise), this Event is incremented by PMMIR_EL1.SLOTS/2 quantities." 
- } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json deleted file mode 100644 index 7c9b2fc4b38cb..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "ArchStdEvent": "L1D_CACHE_REFILL", - "PublicDescription": "The Event counts L1 D-cache refills caused by speculatively executed Load or Store operations or preload Instructions or hardware cache prefetching that missed in the L1 D-cache. This Event only counts one Event per cache line. Since the caches are Write-back only for this processor, there are no Write-through cache accesses." - }, - { - "ArchStdEvent": "L1D_CACHE", - "PublicDescription": "The Event counts L1 D-cache accesses from any Load/Store operations, software preload or hardware prefetch operation. Atomic operations that resolve in the CPUs caches (near atomic operations) counts as both a Write access and Read access. Each access to a cache line is counted including the multiple accesses caused by single Instructions such as LDM or STM. Each access to other L1 Data or unified memory structures, for example refill buffers, Write buffers, and Write-back buffers, are also counted. This Event counts the sum of L1D_CACHE_RD, L1D_CACHE_WR, L1D_CACHE_PRFM and L1D_CACHE_HWPRF." - }, - { - "ArchStdEvent": "L1D_CACHE_WB", - "PublicDescription": "The Event counts Write-backs of dirty Data from the L1 D-cache to the L2 cache. This occurs when either a dirty cache line is evicted from L1 D-cache and allocated in the L2 cache or dirty Data is written to the L2 and possibly to the next level of cache. This Event counts both victim cache line evictions and cache Write-backs from snoops or cache maintenance operations. 
The following cache operations are not counted: * Invalidations which do not result in Data being transferred out of the L1 (such as evictions of clean Data), * Full line Writes which write to L2 without writing L1, such as Write streaming mode. This Event is the sum of the L1D_CACHE_WB_CLEAN and L1D_CACHE_WB_VICTIM Events." - }, - { - "ArchStdEvent": "L1D_CACHE_LMISS_RD", - "PublicDescription": "The Event counts cache line refills into the L1 D-cache from any memory Read operations, that incurred additional latency. Counts same as L1D_CACHE_REFILL_RD on this CPU." - }, - { - "ArchStdEvent": "L1D_CACHE_RD", - "PublicDescription": "The Event counts L1 D-cache accesses from any Load operation. Atomic Load operations that resolve in the CPUs caches counts as both a Write access and Read access." - }, - { - "ArchStdEvent": "L1D_CACHE_WR", - "PublicDescription": "The Event counts L1 D-cache accesses generated by Store operations. This Event also counts accesses caused by a DC ZVA (D-cache zero, specified by virtual address) Instruction. Near atomic operations that resolve in the CPUs caches count as a Write access and Read access. This Event is a subset of the L1D_CACHE Event, except this Event only counts memory Write operations." - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_RD", - "PublicDescription": "The Event counts L1 D-cache refills caused by speculatively executed Load Instructions where the memory Read operation misses in the L1 D-cache. This Event only counts one Event per cache line. This Event is a subset of the L1D_CACHE_REFILL Event, but only counts memory Read operations. This Event does not count reads caused by cache maintenance operations or preload Instructions." - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_WR", - "PublicDescription": "The Event counts L1 D-cache refills caused by speculatively executed Store Instructions where the memory Write operation misses in the L1 D-cache. This Event only counts one Event per cache line. 
This Event is a subset of the L1D_CACHE_REFILL Event, but only counts memory Write operations." - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_INNER", - "PublicDescription": "The Event counts L1 D-cache refills (L1D_CACHE_REFILL) where the cache line Data came from caches inside the immediate Cluster of the Core (L2 cache)." - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_OUTER", - "PublicDescription": "The Event counts L1 D-cache refills (L1D_CACHE_REFILL) for which the cache line Data came from outside the immediate Cluster of the Core, like an SLC in the system interconnect or DRAM or remote socket." - }, - { - "ArchStdEvent": "L1D_CACHE_WB_VICTIM", - "PublicDescription": "The Event counts dirty cache line evictions from the L1 D-cache caused by a new cache line allocation. This Event does not count evictions caused by cache maintenance operations. This Event is a subset of the L1D_CACHE_WB Event, but the Event only counts Write-backs that are a result of the line being allocated for an access made by the CPU." - }, - { - "ArchStdEvent": "L1D_CACHE_WB_CLEAN", - "PublicDescription": "The Event counts Write-backs from the L1 D-cache that are a result of a coherency operation made by another CPU. Event count includes cache maintenance operations. This Event is a subset of the L1D_CACHE_WB Event." - }, - { - "ArchStdEvent": "L1D_CACHE_INVAL", - "PublicDescription": "The Event counts each explicit invalidation of a cache line in the L1 D-cache caused by: * Cache Maintenance Operations (CMO) that operate by a virtual address. * Broadcast cache coherency operations from another CPU in the system. This Event does not count for the following conditions: * A cache refill invalidates a cache line. * A CMO which is executed on that CPU and invalidates a cache line specified by Set/Way. Note that CMOs that operate by Set/Way cannot be broadcast from one CPU to another." 
- }, - { - "ArchStdEvent": "L1D_CACHE_RW", - "PublicDescription": "The Event counts L1 Data demand cache accesses from any Load or Store operation. Near atomic operations that resolve in the CPUs caches counts as both a Write access and Read access. This Event is implemented as L1D_CACHE_RD + L1D_CACHE_WR" - }, - { - "ArchStdEvent": "L1D_CACHE_PRFM", - "PublicDescription": "The Event counts L1 D-cache accesses from software preload or prefetch Instructions." - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_PRFM", - "PublicDescription": "The Event counts L1 D-cache refills where the cache line access was generated by software preload or prefetch Instructions." - }, - { - "ArchStdEvent": "L1D_CACHE_HWPRF", - "PublicDescription": "The Event counts L1 D-cache accesses from any Load/Store operations generated by the hardware prefetcher." - }, - { - "ArchStdEvent": "L1D_CACHE_MISS", - "PublicDescription": "The Event counts each demand access counted by L1D_CACHE_RW that misses in the L1 Data or unified cache, causing an access to outside of the L1 caches of this PE." - }, - { - "ArchStdEvent": "L1D_CACHE_REFILL_HWPRF", - "PublicDescription": "The Event counts each hardware prefetch access counted by L1D_CACHE_HWPRF that causes a refill of the L1 D-cache from outside of the L1 D-cache." - }, - { - "ArchStdEvent": "L1D_CACHE_HIT_RW_FPRFM", - "PublicDescription": "The Event counts each demand access first hit counted by L1D_CACHE_HIT_RW_FPRF where the cache line was fetched in response to a prefetch Instruction. That is, the L1D_CACHE_REFILL_PRFM Event was generated when the cache line was fetched into the cache. Only the first hit by a demand access is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." 
- }, - { - "ArchStdEvent": "L1D_CACHE_HIT_RW_FHWPRF", - "PublicDescription": "The Event counts each demand access first hit counted by L1D_CACHE_HIT_RW_FPRF where the cache line was fetched by a hardware prefetcher. That is, the L1D_CACHE_REFILL_HWPRF Event was generated when the cache line was fetched into the cache. Only the first hit by a demand access is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "ArchStdEvent": "L1D_CACHE_HIT_RW_FPRF", - "PublicDescription": "The Event counts each demand access first hit counted by L1D_CACHE_HIT_RW where the cache line was fetched in response to a prefetch Instruction or by a hardware prefetcher. That is, the L1D_CACHE_REFILL_PRF Event was generated when the cache line was fetched into the cache. Only the first hit by a demand access is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "ArchStdEvent": "L1D_LFB_HIT_RW_FHWPRF", - "PublicDescription": "The Event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW_FPRF, where the cache line was fetched by a hardware prefetcher. That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_HWPRF Event was generated when the cache line was fetched into the cache. Only the first hit by a demand access is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "ArchStdEvent": "L1D_LFB_HIT_RW_FPRFM", - "PublicDescription": "The Event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW_FPRF where the cache line was fetched in response to a prefetch Instruction. 
That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and theL1D_CACHE_REFILL_PRFM Event was generated when the cache line was fetched into the cache. Only the first hit by a demand access is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "ArchStdEvent": "L1D_LFB_HIT_RW_FPRF", - "PublicDescription": "The Event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW where the cache line was fetched in response to a prefetch Instruction or by a hardware prefetcher. That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_PRF Event was generated when the cache line was fetched into the cache. Only the first hit by a demand access is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x0204", - "EventName": "L1D_CACHE_REFILL_OUTER_LLC", - "PublicDescription": "The Event counts L1D_CACHE_REFILL from L3 D-cache." - }, - { - "EventCode": "0x0205", - "EventName": "L1D_CACHE_REFILL_OUTER_DRAM", - "PublicDescription": "The Event counts L1D_CACHE_REFILL from local memory." - }, - { - "EventCode": "0x0206", - "EventName": "L1D_CACHE_REFILL_OUTER_REMOTE", - "PublicDescription": "The Event counts L1D_CACHE_REFILL from a remote memory." - }, - { - "EventCode": "0x01f5", - "EventName": "L1D_CACHE_REFILL_RW", - "PublicDescription": "L1 D-cache refill, demand Read and Write. The Event counts demand Read and Write accesses that causes a refill of the L1 D-cache of this PE, from outside of this cache." 
- } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json deleted file mode 100644 index a1faa284b80f6..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json +++ /dev/null @@ -1,114 +0,0 @@ -[ - { - "ArchStdEvent": "L1I_CACHE_REFILL", - "PublicDescription": "The Event counts cache line refills in the L1 I-cache caused by a missed Instruction fetch (Demand, hardware prefetch and software preload accesses). Instruction fetches may include accessing multiple Instructions, but the single cache line allocation is counted once." - }, - { - "ArchStdEvent": "L1I_CACHE", - "PublicDescription": "The Event counts Instruction fetches (Demand, hardware prefetch and software preload accesses) which access the L1 Instruction Cache. Instruction Cache accesses caused by cache maintenance operations are not counted." - }, - { - "ArchStdEvent": "L1I_CACHE_LMISS", - "PublicDescription": "The Event counts cache line refills into the L1 I-cache, that incurred additional latency. Counts the same as L1I_CACHE_REFILL in this CPU." - }, - { - "ArchStdEvent": "L1I_CACHE_RD", - "PublicDescription": "The Event counts demand Instruction fetches which access the L1 I-cache." - }, - { - "ArchStdEvent": "L1I_CACHE_PRFM", - "PublicDescription": "The Event counts Instruction fetches generated by software preload or prefetch Instructions which access the L1 I-cache." - }, - { - "ArchStdEvent": "L1I_CACHE_HWPRF", - "PublicDescription": "The Event counts Instruction fetches which access the L1 I-cache generated by the hardware prefetcher." - }, - { - "ArchStdEvent": "L1I_CACHE_REFILL_PRFM", - "PublicDescription": "The Event counts cache line refills in the L1 I-cache caused by a missed Instruction fetch generated by software preload or prefetch Instructions. Instruction fetches may include accessing multiple Instructions, but the single cache line allocation is counted once." 
- }, - { - "ArchStdEvent": "L1I_CACHE_HIT_RD", - "PublicDescription": "The Event counts demand Instruction fetches that access the L1 I-cache and hit in the L1 I-cache." - }, - { - "ArchStdEvent": "L1I_CACHE_HIT", - "PublicDescription": "The Event counts Instruction fetches that access the L1 I-cache (Demand, hardware prefetch and software preload accesses) and hit in the L1 I-cache. I-cache accesses caused by cache maintenance operations are not counted." - }, - { - "ArchStdEvent": "L1I_CACHE_HIT_PRFM", - "PublicDescription": "The Event counts Instruction fetches generated by software preload or prefetch Instructions that access the L1 I-cache and hit in the L1 I-cache." - }, - { - "ArchStdEvent": "L1I_LFB_HIT_RD", - "PublicDescription": "The Event counts demand Instruction fetches that access the L1 I-cache and hit in a line that is in the process of being loaded into the L1 I-cache." - }, - { - "ArchStdEvent": "L1I_CACHE_REFILL_HWPRF", - "PublicDescription": "The Event counts each hardware prefetch access counted by L1I_CACHE_HWPRF that causes a refill of the Level 1I-cache from outside of the L1 I-cache." - }, - { - "ArchStdEvent": "L1I_CACHE_HIT_RD_FPRF", - "PublicDescription": "The Event counts each demand fetch first hit counted by L1I_CACHE_HIT_RD where the cache line was fetched in response to a software preload or by a hardware prefetcher. That is, the L1I_CACHE_REFILL_PRF Event was generated when the cache line was fetched into the cache. Only the first hit by a demand access is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x0174", - "EventName": "L1I_HWPRF_REQ_DROP", - "PublicDescription": "L1 I-cache hardware prefetch dropped." - }, - { - "EventCode": "0x01ea", - "EventName": "L1I_CFC_ENTRIES", - "PublicDescription": "The Event counts the CFC (Cache Fill Control) entries. The CFC is the fill buffer for I-cache." 
- }, - { - "EventCode": "0x0228", - "EventName": "L1I_CACHE_HIT_PRFM_FPRF", - "PublicDescription": "L1 I-cache software prefetch access first hit, fetched by hardware or software prefetch. The Event counts each software preload access first hit where the cache line was fetched in response to a hadware prefetcher or software preload Instruction. Only the first hit is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x0212", - "EventName": "L1I_CACHE_HIT_HWPRF", - "PublicDescription": "The Event counts each hardware prefetch access that hits an L1 I-cache." - }, - { - "EventCode": "0x022a", - "EventName": "L1I_CACHE_HIT_HWPRF_FPRF", - "PublicDescription": "L1 I-cache hardware prefetch access first hit, fetched by hardware or software prefetch. The Event counts each hardware prefetch access first hit where the cache line was fetched in response to a hardware or prefetch Instruction. Only the first hit is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x0215", - "EventName": "L1I_LFB_HIT", - "PublicDescription": "L1 Line fill buffer hit. The Event counts each Demand or software preload or hardware prefetch induced Instruction fetch that hits an L1 I-cache line that is in the process of being loaded into the L1 Instruction, and so does not generate a new refill, but has to wait for the previous refill to complete." - }, - { - "EventCode": "0x0216", - "EventName": "L1I_LFB_HIT_PRFM", - "PublicDescription": "The Event counts each software prefetch access that hits a cache line that is in the process of being loaded into the L1 Instruction, and so does not generate a new refill, but has to wait for the previous refill to complete." 
- }, - { - "EventCode": "0x0219", - "EventName": "L1I_LFB_HIT_HWPRF", - "PublicDescription": "The Event counts each hardware prefetch access that hits a cache line that is in the process of being loaded into the L1 Instruction, and so does not generate a new refill, but has to wait for the previous refill to complete." - }, - { - "EventCode": "0x0221", - "EventName": "L1I_PRFM_REQ", - "PublicDescription": "L1 I-cache software prefetch requests." - }, - { - "EventCode": "0x0222", - "EventName": "L1I_HWPRF_REQ", - "PublicDescription": "L1 I-cache hardware prefetch requests." - }, - { - "EventCode": "0x01e3", - "EventName": "L1I_CACHE_REFILL_RD", - "PublicDescription": "L1 I-cache refill, Read. The Event counts demand Instruction fetch that causes a refill of the L1 I-cache of this PE, from outside of this cache." - }, - { - "EventCode": "0x01ef", - "EventName": "L1I_CACHE_INVAL", - "PublicDescription": "L1 I-cache invalidate. The Event counts each explicit invalidation of a cache line in the L1 I-cache caused by: * Broadcast cache coherency operations from another CPU in the system. * Invalidation dues to capacity eviction in L2 D-cache. This Event does not count for the following conditions: * A cache refill invalidates a cache line. * A CMO which is executed on that CPU Core and invalidates a cache line specified by Set/Way. * Cache Maintenance Operations (CMO) that operate by a virtual address. Note that * CMOs that operate by Set/Way cannot be broadcast from one CPU Core to another. * The CMO is treated as No-op for the purposes of L1 I-cache line invalidation, as this Core implements fully coherent I-cache." 
- } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json deleted file mode 100644 index ac3f8095a9979..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json +++ /dev/null @@ -1,134 +0,0 @@ -[ - { - "ArchStdEvent": "L2D_CACHE", - "PublicDescription": "The Event counts accesses to the L2 cache due to Data accesses. L2 cache is a unified cache for Data and Instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This Event also counts Write back of dirty Data from L1 D-cache to the L2 cache. This CPU includes I-cache accesses in this counter as L2I equivalent Event was not implemented. This Event is the sum of the L2D_CACHE_RD, L2D_CACHE_WR, L2D_CACHE_PRFM, and L2D_CACHE_HWPRF Events." - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL", - "PublicDescription": "The Event counts cache line refills into the L2 cache. L2 cache is a unified cache for Data and Instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This CPU includes I-cache refills in this counter as L2I equivalent Event was not implemented. This Event is the sum of L2D_CACHE_REFILL_RD, L2D_CACHE_REFILL_WR, L2D_CACHE_REFILL_HWPRF, and L2D_CACHE_REFILL_PRFM." - }, - { - "ArchStdEvent": "L2D_CACHE_WB", - "PublicDescription": "The Event counts Write-backs of Data from the L2 cache to outside the CPU. This includes snoops to the L2 (from other CPUs) which return Data even if the snoops cause an invalidation. L2 cache line invalidations which do not write Data outside the CPU and snoops which return Data from an L1 cache are not counted. Data would not be written outside the cache when invalidating a clean cache line. This Event is the sum of the L2D_CACHE_WB_VICTIM and L2D_CACHE_WB_CLEAN Events." 
- }, - { - "ArchStdEvent": "L2D_CACHE_RD", - "PublicDescription": "The Event counts L2 D-cache accesses due to memory Read operations. L2 cache is a unified cache for Data and Instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This CPU includes I-cache accesses in this counter as L2I equivalent Event was not implemented. This Event is a subset of the L2D_CACHE Event, but this Event only counts memory Read operations." - }, - { - "ArchStdEvent": "L2D_CACHE_WR", - "PublicDescription": "The Event counts L2 cache accesses due to memory Write operations. L2 cache is a unified cache for Data and Instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This Event is a subset of the L2D_CACHE Event, but this Event only counts memory Write operations." - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL_RD", - "PublicDescription": "The Event counts refills for memory accesses due to memory Read operation counted by L2D_CACHE_RD. L2 cache is a unified cache for Data and Instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This CPU includes I-cache refills in this counter as L2I equivalent Event was not implemented. This Event is a subset of the L2D_CACHE_REFILL Event. This Event does not count L2 refills caused by stashes into L2. This count includes demand requests that encounter an L2 prefetch request or an L2 software prefetch request to the same cache line, which is still pending in the L2 LFB." - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL_WR", - "PublicDescription": "The Event counts refills for memory accesses due to memory Write operation counted by L2D_CACHE_WR. L2 cache is a unified cache for Data and Instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses. 
This count includes demand requests that encounter an L2 prefetch request or an L2 software prefetch request to the same cache line, which is still pending in the L2 LFB." - }, - { - "ArchStdEvent": "L2D_CACHE_WB_VICTIM", - "PublicDescription": "The Event counts evictions from the L2 cache because of a line being allocated into the L2 cache. This Event is a subset of the L2D_CACHE_WB Event." - }, - { - "ArchStdEvent": "L2D_CACHE_WB_CLEAN", - "PublicDescription": "The Event counts Write-backs from the L2 cache that are a result of any of the following: * Cache maintenance operations, * Snoop responses or, * Direct cache transfers to another CPU due to a forwarding snoop request. This Event is a subset of the L2D_CACHE_WB Event." - }, - { - "ArchStdEvent": "L2D_CACHE_INVAL", - "PublicDescription": "The Event counts each explicit invalidation of a cache line in the L2 cache by cache maintenance operations that operate by a virtual address, or by external coherency operations. This Event does not count if either: * A cache refill invalidates a cache line, or * A cache Maintenance Operation (CMO), which invalidates a cache line specified by Set/Way, is executed on that CPU. CMOs that operate by Set/Way cannot be broadcast from one CPU to another." - }, - { - "ArchStdEvent": "L2D_CACHE_LMISS_RD", - "PublicDescription": "The Event counts cache line refills into the L2 unified cache from any memory Read operations that incurred additional latency. Counts the same as L2D_CACHE_REFILL_RD in this CPU" - }, - { - "ArchStdEvent": "L2D_CACHE_RW", - "PublicDescription": "The Event counts L2 cache demand accesses from any Load/Store operations. L2 cache is a unified cache for Data and Instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This CPU includes I-cache accesses in this counter as L2I equivalent Event was not implemented. This Event is the sum of the L2D_CACHE_RD and L2D_CACHE_WR Events." 
- }, - { - "ArchStdEvent": "L2D_CACHE_PRFM", - "PublicDescription": "The Event counts L2 D-cache accesses generated by software preload or prefetch Instructions with target = L1/L2/L3 cache. Note that a software preload or prefetch Instructions with (target = L1/L2/L3) that hits in L1D will not result in an L2 D-cache access. Therefore, such a software preload or prefetch Instructions will not be counted by this Event." - }, - { - "ArchStdEvent": "L2D_CACHE_MISS", - "PublicDescription": "The Event counts cache line misses in the L2 cache. L2 cache is a unified cache for Data and Instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses. Counts same as L2D_CACHE_REFILL_RD in this CPU" - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL_PRFM", - "PublicDescription": "The Event counts refills due to accesses generated as a result of software preload or prefetch Instructions as counted by L2D_CACHE_PRFM. This CPU includes I-cache refills in this counter as L2I equivalent Event was not implemented." - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL_HWPRF", - "PublicDescription": "The Event counts each hardware prefetch access counted by L2D_CACHE_HWPRF that causes a refill of the L2 cache, or any L1 Data, or Instruction of this PE, from outside of those caches. This does not include prefetch requests pending waiting for a refill in LFB and a new demand request to the same cache line hitting the LFB entry. All such refills are counted as L2D_LFB_HIT_RWL1PRF_FHWPRF." - }, - { - "ArchStdEvent": "L2D_CACHE_REFILL_PRF", - "PublicDescription": "The Event counts each access to L2 Cache due to a prefetch Instruction, or hardware prefetch that causes a refill of the L2 or any Level 1, from outside of those caches." - }, - { - "ArchStdEvent": "L2D_CACHE_HWPRF", - "PublicDescription": "The Event counts the L2 D-cache access caused by L1 or L2 hardware prefetcher." 
- }, - { - "EventCode": "0x0108", - "EventName": "L2D_CACHE_IF_REFILL", - "PublicDescription": "L2 D-cache refill, Instruction fetch. The Event counts demand Instruction fetch that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x0109", - "EventName": "L2D_CACHE_TBW_REFILL", - "PublicDescription": "L2 D-cache refill, Page table walk. The Event counts demand translation table walk that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x010a", - "EventName": "L2D_CACHE_PF_REFILL", - "PublicDescription": "L2 D-cache refill, prefetch. The Event counts L1 or L2 hardware or software prefetch accesses that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x0201", - "EventName": "L2D_CACHE_BACKSNOOP_L1D_VIRT_ALIASING", - "PublicDescription": "The Event counts when the L2 D-cache sends an invalidating back-snoop to the L1 D for an access initiated by the L1 D, where the corresponding line is already present in the L1 D-cache. The L2 D-cache line tags the PE that refilled the line. It also retains specific bits of the VA to identify virtually aliased addresses. The L1 D request requiring a back-snoop can originate either from the same PE that refilled the L2 D line or from a different PE. In either case, this Event only counts those back snoop where the requested VA mismatch the VA stored in the L2 D tag. This Event is counted only by PE that initiated the original request necessitating a back-snoop. Note : The L1 D is VIPT, it identifies this access as a miss. Conversely, as L2 is PIPT, it identifies this as a hit. L2 D utilizes the back-snoop mechanism to refill L1 D with the snooped Data." - }, - { - "EventCode": "0x0179", - "EventName": "L2D_CACHE_HIT_RWL1PRF_FHWPRF", - "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by hardware prefetch.. 
The Event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 hardware prefetch. Only the first hit is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x020c", - "EventName": "L2D_CACHE_HIT_RWL1PRF_FPRFM", - "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by software prefetch. The Event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 software prefetch. Only the first hit is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x020e", - "EventName": "L2D_CACHE_HIT_RWL1PRF_FPRF", - "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by software or hardware prefetch. The Event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 hardware prefetch or software prefetch. Only the first hit is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x010b", - "EventName": "L2D_LFB_HIT_RWL1PRF_FHWPRF", - "PublicDescription": "L2 line fill buffer demand Read, demand Write or L1 prefetch first hit, fetched by hardware prefetch. The Event counts each of the following access that hit the line-fill buffer when the same cache line is already being fetched due to an L2 hardware prefetcher. 
* Demand Read or Write * L1I-HWPRF * L1D-HWPRF * L1I PRFM * L1D PRFM These accesses hit a cache line that is currently being loaded into the L2 cache as a result of a hardware prefetcher to the same line. Consequently, this access does not initiate a new refill but waits for the completion of the previous refill. Only the first hit is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." - }, - { - "EventCode": "0x01b9", - "EventName": "L2D_CACHE_REFILL_L1PRF", - "PublicDescription": "L2 D-cache refill, L1 hardware or software prefetch. The Event counts each access counted by L2D_CACHE_L1PRF that causes a refill of the L2 cache or any L1 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x020a", - "EventName": "L2D_CACHE_REFILL_RWL1PRF", - "PublicDescription": "L2 D-cache refill, demand Read, demand Write or L1 hardware or software prefetch. The Event counts each access counted by L2D_CACHE_RWL1PRF that causes a refill of the L2 cache, or any L1 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x01b8", - "EventName": "L2D_CACHE_L1PRF", - "PublicDescription": "L2 D-cache access, L1 hardware or software prefetch. The Event counts L1 Hardware or software prefetch access to L2 D-cache." - }, - { - "EventCode": "0x0208", - "EventName": "L2D_CACHE_RWL1PRF", - "PublicDescription": "L2 D-cache access, demand Read, demand Write or L1 hardware or software prefetch. The Event counts each access to L2 D-cache due to the following: * Demand Read or Write. * L1 Hardware or software prefetch." 
- } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json deleted file mode 100644 index 661cbed4ee347..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json +++ /dev/null @@ -1,107 +0,0 @@ -[ - { - "ArchStdEvent": "LL_CACHE_RD", - "PublicDescription": "This is an alias to the Event L3D_CACHE_RD (0x00a0)." - }, - { - "ArchStdEvent": "LL_CACHE_MISS_RD", - "PublicDescription": "This is an alias to the Event L3D_CACHE_REFILL_RD (0x00a2)." - }, - { - "ArchStdEvent": "L3D_CACHE_ALLOCATE", - "PublicDescription": "The Event counts each memory Write operation that writes an entire line into the L3 Data without fetching Data from outside the L3 Data. These are allocations of cache lines in the L3 Data that are not refills counted by L3D_CACHE_REFILL. For example: A Write-back of an entire cache line from an L2 cache to the L3 D-cache. * A Write of an entire cache line from a coalescing Write buffer. * An operation such as DC ZVA. This counter does not count that writes an entire line to beyond level 3. Thus this counter does not count the streaming Writes to beyond L3 cache." - }, - { - "ArchStdEvent": "L3D_CACHE_REFILL", - "PublicDescription": "The Event counts each access counted by L3D_CACHE that causes a refill of the L3 Data,or any L1 Data, Instruction or L2 cache of this PE, from outside of those caches. This includes the refill due to hardware prefetch and software prefetch accesses. This Event is a sum of L3D_CACHE_MISS, L3D_CACHE_REFILL_PRFM and L3D_CACHE_REFILL_HWPRF Event. A refill includes any access that causes Data to be fetched from outside of the L1 to L3 caches, even if the Data is ultimately not allocated into the L3 D-cache." - }, - { - "ArchStdEvent": "L3D_CACHE", - "PublicDescription": "The Event counts each memory Read operation or memory Write operation that causes a cache access to the Level 3. 
This Event is a sum of the following Events: * L3D_CACHE_RD(0x00a0) * L3D_CACHE_ALLOCATE(0x0029) * L3D_CACHE_PRFM(0x8151) * L3D_CACHE_HWPRF(0x8156) * L2D_CACHE_WB(0x0018)" - }, - { - "ArchStdEvent": "L3D_CACHE_RD", - "PublicDescription": "The Event counts each Memory Read operation to L3 D-cache from Instruction Fetch, Load/Store, and MMU translation table accesses. This does not include HWPRF or PRFM accesses. This include L1 and L2 prefetcher accesses to L3 D-cache." - }, - { - "ArchStdEvent": "L3D_CACHE_REFILL_RD", - "PublicDescription": "The Event counts each access counted by both L3D_CACHE_RD and L3D_CACHE_REFILL. That is, every refill of the L3 cache counted by L3D_CACHE_REFILL that is caused by a Memory Read operation. The L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same Event in the hardware." - }, - { - "ArchStdEvent": "L3D_CACHE_LMISS_RD", - "PublicDescription": "The Event counts each memory Read operation to the L3 cache counted by L3D_CACHE that incurs additional latency because it returns Data from outside of the L1 to L3 caches. The L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same Event in the hardware." - }, - { - "ArchStdEvent": "L3D_CACHE_RW", - "PublicDescription": "The Event counts each access counted by L3D_CACHE that is due to a demand memory Read operation or demand memory Write operation. This Event is a sum of L3D_CACHE_RD(0x00a0), L3D_CACHE_ALLOCATE(0x0029) and L2D_CACHE_WB(0x0018). Note that this counter does not count that writes an entire line to beyond level 3. Thus this counter does not count the streaming Writes to beyond L3 cache." - }, - { - "ArchStdEvent": "L3D_CACHE_PRFM", - "PublicDescription": "The Event counts each access counted by L3D_CACHE that is due to a prefetch Instruction. This includes L3 Data accesses due to the L1, L2, or L3 prefetch Instruction." 
- }, - { - "ArchStdEvent": "L3D_CACHE_MISS", - "PublicDescription": "The Event counts each demand Read access counted by L3D_CACHE_RD that misses in the L1 to L3 Data, causing an access to outside of the L3 cache. The L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same Event in the hardware." - }, - { - "ArchStdEvent": "L3D_CACHE_REFILL_PRFM", - "PublicDescription": "The Event counts each access counted by L3D_CACHE_PRFM that causes a refill of the L3 cache, or any L1 or L2 Data, from outside of those caches." - }, - { - "ArchStdEvent": "L3D_CACHE_HWPRF", - "PublicDescription": "The Event counts each access to L3 cache that is due to a hardware prefetcher. This includes L3D accesses due to the Level-1 or Level-2 or Level-3 hardware prefetcher." - }, - { - "ArchStdEvent": "L3D_CACHE_REFILL_HWPRF", - "PublicDescription": "The Event counts each hardware prefetch counted by L3D_CACHE_HWPRF that causes a refill of the L3 Data or unified cache, or any L1 or L2 Data, Instruction, or unified cache of this PE, from outside of those caches." - }, - { - "ArchStdEvent": "L3D_CACHE_REFILL_PRF", - "PublicDescription": "The Event counts each access to L3 cache due to a prefetch Instruction, or hardware prefetch that causes a refill of the L3 Data, or any L1 or L2 Data, from outside of those caches." - }, - { - "EventCode": "0x01f7", - "EventName": "L3D_CACHE_HIT_RWL1PRFL2PRF_FPRF", - "PublicDescription": "L3 cache demand Read, demand Write , L1 prefetch L2 prefetch first hit, fetched by software or hardware prefetch. The Event counts each demand Read, demand Write , L1 hardware or software prefetch request and L2 hardware or software prefetch that hit an L3 D-cache line that was refilled into L3 D-cache in response to an L3 hardware prefetch or software prefetch. Only the first hit is counted. After this Event is generated for a cache line, the Event is not generated again for the same cache line while it remains in the cache." 
- }, - { - "EventCode": "0x01e9", - "EventName": "L3D_CACHE_REFILL_RWL1PRFL2PRF", - "PublicDescription": "L3 cache refill, demand Read, demand Write, L1 hardware or software prefetch or L2 hardware or software prefetch. The Event counts each access counted by L3D_CACHE_RWL1PRFL2PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x01e8", - "EventName": "L3D_CACHE_RWL1PRFL2PRF", - "PublicDescription": "L3 cache access, demand Read, demand Write, L1 hardware or software prefetch or L2 hardware or software prefetch. The Event counts each access to L3 D-cache due to the following: * Demand Read or Write. * L1 Hardware or software prefetch. * L2 Hardware or software prefetch." - }, - { - "EventCode": "0x0225", - "EventName": "L3D_CACHE_REFILL_IF", - "PublicDescription": "L3 cache refill, Instruction fetch. The Event counts demand Instruction fetch that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x0226", - "EventName": "L3D_CACHE_REFILL_MM", - "PublicDescription": "L3 cache refill, translation table walk access. The Event counts demand translation table access that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x0227", - "EventName": "L3D_CACHE_REFILL_L1PRF", - "PublicDescription": "The Event counts each access counted by L3D_CACHE_L1PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." - }, - { - "EventCode": "0x01f6", - "EventName": "L3D_CACHE_REFILL_L2PRF", - "PublicDescription": "The Event counts each access counted by L3D_CACHE_L2PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." 
- }, - { - "EventCode": "0x022c", - "EventName": "L3D_CACHE_L1PRF", - "PublicDescription": "The Event counts the L3 D-cache access due to L1 hardware prefetch of software prefetch request. The L1 hardware prefetch or software prefetch request that miss the L1I, L1D and L2 D-cache are counted by this counter" - }, - { - "EventCode": "0x022d", - "EventName": "L3D_CACHE_L2PRF", - "PublicDescription": "The Event counts the L3 D-cache access due to L2 hardware prefetch of software prefetch request. The L2 hardware prefetch or software prefetch request that miss the L2 D-cache are counted by this counter" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json deleted file mode 100644 index 64fced85a9881..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "ArchStdEvent": "MEM_ACCESS", - "PublicDescription": "The Event counts memory accesses issued by the CPU Load/Store unit, where those accesses are issued due to Load or Store operations. This Event counts memory accesses no matter whether the Data is received from any level of cache hierarchy or external memory. If memory accesses are broken up into smaller transactions than what were specified in the Load or Store Instructions, then the Event counts those smaller memory transactions. Memory accesses generated by the following Instructions or activity are not counted: Instruction fetches, cache maintenance Instructions, Translation table walks or prefetches, Memory prefetch operations. This Event counts the sum of the MEM_ACCESS_RD and MEM_ACCESS_WR Events." - }, - { - "ArchStdEvent": "MEMORY_ERROR", - "PublicDescription": "The Event counts any detected correctable or uncorrectable physical memory errors (ECC or parity) in protected CPUs RAMs. On the Core, this Event counts errors in the caches (including Data and tag rams). 
Any detected memory error (from either a speculative and abandoned access, or an architecturally executed access) is counted. Note that errors are only detected when the actual protected memory is accessed by an operation." - }, - { - "ArchStdEvent": "REMOTE_ACCESS", - "PublicDescription": "Counter counts each external bus Read access that causes an access to a remote device. That is, a socket that does not contain the PE." - }, - { - "ArchStdEvent": "MEM_ACCESS_RD", - "PublicDescription": "The Event counts memory accesses issued by the CPU due to Load operations. The Event counts any memory Load access, no matter whether the Data is received from any level of cache hierarchy or external memory. The Event also counts atomic Load operations. If memory accesses are broken up by the Load/Store unit into smaller transactions that are issued by the bus interface, then the Event counts those smaller transactions. The following Instructions are not counted: 1) Instruction fetches, 2) Cache maintenance Instructions, 3) Translation table walks or prefetches, 4) Memory prefetch operations. This Event is a subset of the MEM_ACCESS Event but the Event only counts memory-Read operations." - }, - { - "ArchStdEvent": "MEM_ACCESS_WR", - "PublicDescription": "The Event counts memory accesses issued by the CPU due to Store operations. The Event counts any memory Store access, no matter whether the Data is located in any level of cache or external memory. The Event also counts atomic Load and Store operations. If memory accesses are broken up by the Load/Store unit into smaller transactions that are issued by the bus interface, then the Event counts those smaller transactions." - }, - { - "ArchStdEvent": "LDST_ALIGN_LAT", - "PublicDescription": "The Event counts the number of memory Read and Write accesses in a cycle that incurred additional latency, due to the alignment of the address and the size of Data being accessed, which results in Store crossing a single cache line. 
This Event is implemented as the sum of LD_ALIGN_LAT and ST_ALIGN_LAT on this CPU." - }, - { - "ArchStdEvent": "LD_ALIGN_LAT", - "PublicDescription": "The Event counts the number of memory Read accesses in a cycle that incurred additional latency, due to the alignment of the address and size of Data being accessed, which results in Load crossing a single cache line." - }, - { - "ArchStdEvent": "ST_ALIGN_LAT", - "PublicDescription": "The Event counts the number of memory Write access in a cycle that incurred additional latency, due to the alignment of the address and size of Data being accessed incurred additional latency." - }, - { - "ArchStdEvent": "INST_FETCH_PERCYC", - "PublicDescription": "The Event counts number of Instruction fetches outstanding per cycle, which will provide an average latency of Instruction fetch." - }, - { - "ArchStdEvent": "MEM_ACCESS_RD_PERCYC", - "PublicDescription": "The Event counts the number of outstanding Loads or memory Read accesses per cycle." - }, - { - "ArchStdEvent": "INST_FETCH", - "PublicDescription": "The Event counts Instruction memory accesses that the PE makes." 
- } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json deleted file mode 100644 index 18c2fd58ee9ec..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json +++ /dev/null @@ -1,722 +0,0 @@ -[ - { - "MetricName": "backend_bound", - "MetricExpr": "100 * (STALL_SLOT_BACKEND / CPU_SLOT)", - "BriefDescription": "This metric is the percentage of total slots that were stalled due to resource constraints in the backend of the processor.", - "ScaleUnit": "1percent of slots", - "MetricGroup": "TopdownL1" - }, - { - "MetricName": "backend_busy_bound", - "MetricExpr": "100 * (STALL_BACKEND_BUSY / STALL_BACKEND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to issue queues being full to accept operations for execution.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_cache_l1d_bound", - "MetricExpr": "100 * (STALL_BACKEND_L1D / (STALL_BACKEND_L1D + STALL_BACKEND_MEM))", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by L1 D-cache misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_cache_l2d_bound", - "MetricExpr": "100 * (STALL_BACKEND_MEM / (STALL_BACKEND_L1D + STALL_BACKEND_MEM))", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by L2 D-cache misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_core_bound", - "MetricExpr": "100 * (STALL_BACKEND_CPUBOUND / STALL_BACKEND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to backend Core resource constraints not related to Instruction fetch latency issues caused by memory 
access components.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_core_rename_bound", - "MetricExpr": "100 * (STALL_BACKEND_RENAME / STALL_BACKEND_CPUBOUND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend as the rename unit registers are unavailable.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_mem_bound", - "MetricExpr": "100 * (STALL_BACKEND_MEMBOUND / STALL_BACKEND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to backend Core resource constraints related to memory access latency issues caused by memory access components.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_mem_cache_bound", - "MetricExpr": "100 * ((STALL_BACKEND_L1D + STALL_BACKEND_MEM) / STALL_BACKEND_MEMBOUND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory latency issues caused by D-cache misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_mem_store_bound", - "MetricExpr": "100 * (STALL_BACKEND_ST / STALL_BACKEND_MEMBOUND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory Write pending caused by Stores stalled in the pre-commit stage.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_mem_tlb_bound", - "MetricExpr": "100 * (STALL_BACKEND_TLB / STALL_BACKEND_MEMBOUND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by Data TLB misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Backend" - }, - { - "MetricName": "backend_stalled_cycles", - "MetricExpr": "100 * (STALL_BACKEND / 
CPU_CYCLES)", - "BriefDescription": "This metric is the percentage of cycles that were stalled due to resource constraints in the backend unit of the processor.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Cycle_Accounting" - }, - { - "MetricName": "bad_speculation", - "MetricExpr": "100 - (frontend_bound + retiring + backend_bound)", - "BriefDescription": "This metric is the percentage of total slots that executed operations and didn't retire due to a pipeline flush. This indicates cycles that were utilized but inefficiently.", - "ScaleUnit": "1percent of slots", - "MetricGroup": "TopdownL1" - }, - { - "MetricName": "bus_bandwidth", - "MetricExpr": "BUS_ACCESS * 32 / duration_time ", - "BriefDescription": "This metrics measures the bus-bandwidth of the Data transferred between this PE's L2 with unCore in the system", - "ScaleUnit": "1Bytes/sec" - }, - { - "MetricName": "barrier_percentage", - "MetricExpr": "100 * ((ISB_SPEC + DSB_SPEC + DMB_SPEC) / INST_SPEC)", - "BriefDescription": "This metric measures Instruction and Data barrier operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "branch_direct_ratio", - "MetricExpr": "BR_IMMED_RETIRED / BR_RETIRED", - "BriefDescription": "This metric measures the ratio of direct branches retired to the total number of Branches architecturally executed.", - "ScaleUnit": "1per branch", - "MetricGroup": "Branch_Effectiveness" - }, - { - "MetricName": "branch_indirect_ratio", - "MetricExpr": "BR_IND_RETIRED / BR_RETIRED", - "BriefDescription": "This metric measures the ratio of indirect Branches retired, including function returns, to the total number of Branches architecturally executed.", - "ScaleUnit": "1per branch", - "MetricGroup": "Branch_Effectiveness" - }, - { - "MetricName": "branch_misprediction_ratio", - "MetricExpr": "BR_MIS_PRED_RETIRED / BR_RETIRED", - "BriefDescription": "This metric 
measures the ratio of Branches mispredicted to the total number of Branches architecturally executed. This gives an indication of the effectiveness of the Branch prediction unit.", - "ScaleUnit": "1per branch", - "MetricGroup": "Miss_Ratio;Branch_Effectiveness" - }, - { - "MetricName": "branch_mpki", - "MetricExpr": "1000 * (BR_MIS_PRED_RETIRED / INST_RETIRED)", - "BriefDescription": "This metric measures the number of Branch mispredictions per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;Branch_Effectiveness" - }, - { - "MetricName": "branch_percentage", - "MetricExpr": "100 * ((BR_IMMED_SPEC + BR_INDIRECT_SPEC) / INST_SPEC)", - "BriefDescription": "This metric measures Branch operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "branch_return_ratio", - "MetricExpr": "BR_RETURN_RETIRED / BR_RETIRED", - "BriefDescription": "This metric measures the ratio of Branches retired that are function returns to the total number of Branches architecturally executed.", - "ScaleUnit": "1per branch", - "MetricGroup": "Branch_Effectiveness" - }, - { - "MetricName": "cpu_cycles_fraction_in_st_mode", - "MetricExpr": "((CPU_SLOT/CPU_CYCLES) - 5) / 5", - "BriefDescription": "This metric counts fraction of the CPU cycles spent in ST mode during program execution.", - "ScaleUnit": "1fraction of cycles", - "MetricGroup": "SMT" - }, - { - "MetricName": "cpu_cycles_in_smt_mode", - "MetricExpr": "(1 - cpu_cycles_fraction_in_st_mode) * CPU_CYCLES", - "BriefDescription": "This metric counts CPU cycles in SMT mode during program execution.", - "ScaleUnit": "1CPU cycles", - "MetricGroup": "SMT" - }, - { - "MetricName": "cpu_cycles_in_st_mode", - "MetricExpr": "cpu_cycles_fraction_in_st_mode * CPU_CYCLES", - "BriefDescription": "This metric counts CPU cycles in ST mode during program execution.", - "ScaleUnit": "1CPU cycles", - "MetricGroup": 
"SMT" - }, - { - "MetricName": "crypto_percentage", - "MetricExpr": "100 * (CRYPTO_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures crypto operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "dtlb_mpki", - "MetricExpr": "1000 * (DTLB_WALK / INST_RETIRED)", - "BriefDescription": "This metric measures the number of Data TLB Walks per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;DTLB_Effectiveness" - }, - { - "MetricName": "dtlb_walk_average_latency", - "MetricExpr": "DTLB_WALK_PERCYC / DTLB_WALK", - "BriefDescription": "This metric measures the average latency of Data TLB walks in CPU cycles", - "ScaleUnit": "1CPU cycles", - "MetricGroup": "Average_Latency" - }, - { - "MetricName": "dtlb_walk_ratio", - "MetricExpr": "DTLB_WALK / L1D_TLB", - "BriefDescription": "This metric measures the ratio of Data TLB Walks to the total number of Data TLB accesses. This gives an indication of the effectiveness of the Data TLB accesses.", - "ScaleUnit": "1per TLB access", - "MetricGroup": "Miss_Ratio;DTLB_Effectiveness" - }, - { - "MetricName": "fp_ops_per_cycle", - "MetricExpr": "(FP_SCALE_OPS_SPEC + FP_FIXED_OPS_SPEC) / CPU_CYCLES", - "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by any Instruction. 
Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.", - "ScaleUnit": "1operations per cycle", - "MetricGroup": "FP_Arithmetic_Intensity" - }, - { - "MetricName": "fp16_percentage", - "MetricExpr": "100 * (FP_HP_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures half-precision floating point operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "FP_Precision_Mix" - }, - { - "MetricName": "fp32_percentage", - "MetricExpr": "100 * (FP_SP_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures single-precision floating point operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "FP_Precision_Mix" - }, - { - "MetricName": "fp64_percentage", - "MetricExpr": "100 * (FP_DP_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures double-precision floating point operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "FP_Precision_Mix" - }, - { - "MetricName": "frontend_bound", - "MetricExpr": "100 * (STALL_SLOT_FRONTEND_WITHOUT_MISPRED / CPU_SLOT)", - "BriefDescription": "This metric is the percentage of total slots that were stalled due to resource constraints in the frontend of the processor.", - "ScaleUnit": "1percent of slots", - "MetricGroup": "TopdownL1" - }, - { - "MetricName": "frontend_cache_l1i_bound", - "MetricExpr": "100 * (STALL_FRONTEND_L1I / (STALL_FRONTEND_L1I + STALL_FRONTEND_MEM))", - "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to memory access latency issues caused by L1 I-cache misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_cache_l2i_bound", - "MetricExpr": "100 * (STALL_FRONTEND_MEM / (STALL_FRONTEND_L1I + 
STALL_FRONTEND_MEM))", - "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to memory access latency issues caused by L2 I-cache misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_core_bound", - "MetricExpr": "100 * (STALL_FRONTEND_CPUBOUND / STALL_FRONTEND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to frontend Core resource constraints not related to Instruction fetch latency issues caused by memory access components.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_core_flow_bound", - "MetricExpr": "100 * (STALL_FRONTEND_FLOW / STALL_FRONTEND_CPUBOUND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend as the decode unit is awaiting input from the Branch prediction unit.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_core_flush_bound", - "MetricExpr": "100 * (STALL_FRONTEND_FLUSH / STALL_FRONTEND_CPUBOUND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend as the processor is recovering from a pipeline flush caused by bad speculation or other machine resteers.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_mem_bound", - "MetricExpr": "100 * (STALL_FRONTEND_MEMBOUND / STALL_FRONTEND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to frontend Core resource constraints related to the Instruction fetch latency issues caused by memory access components.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_mem_cache_bound", - "MetricExpr": "100 * ((STALL_FRONTEND_L1I + STALL_FRONTEND_MEM) / STALL_FRONTEND_MEMBOUND)", - 
"BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to Instruction fetch latency issues caused by I-cache misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_mem_tlb_bound", - "MetricExpr": "100 * (STALL_FRONTEND_TLB / STALL_FRONTEND_MEMBOUND)", - "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to Instruction fetch latency issues caused by Instruction TLB misses.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Topdown_Frontend" - }, - { - "MetricName": "frontend_stalled_cycles", - "MetricExpr": "100 * (STALL_FRONTEND / CPU_CYCLES)", - "BriefDescription": "This metric is the percentage of cycles that were stalled due to resource constraints in the frontend unit of the processor.", - "ScaleUnit": "1percent of cycles", - "MetricGroup": "Cycle_Accounting" - }, - { - "MetricName": "instruction_fetch_average_latency", - "MetricExpr": "INST_FETCH_PERCYC / INST_FETCH", - "BriefDescription": "This metric measures the average latency of Instruction fetches in CPU cycles", - "ScaleUnit": "1CPU cycles", - "MetricGroup": "Average_Latency" - }, - { - "MetricName": "integer_dp_percentage", - "MetricExpr": "100 * (DP_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures scalar integer operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "ipc", - "MetricExpr": "INST_RETIRED / CPU_CYCLES", - "BriefDescription": "This metric measures the number of Instructions retired per cycle.", - "ScaleUnit": "1per cycle", - "MetricGroup": "General" - }, - { - "MetricName": "itlb_mpki", - "MetricExpr": "1000 * (ITLB_WALK / INST_RETIRED)", - "BriefDescription": "This metric measures the number of Instruction TLB Walks per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": 
"MPKI;ITLB_Effectiveness" - }, - { - "MetricName": "itlb_walk_average_latency", - "MetricExpr": "ITLB_WALK_PERCYC / ITLB_WALK", - "BriefDescription": "This metric measures the average latency of Instruction TLB walks in CPU cycles", - "ScaleUnit": "1CPU cycles", - "MetricGroup": "Average_Latency" - }, - { - "MetricName": "itlb_walk_ratio", - "MetricExpr": "ITLB_WALK / L1I_TLB", - "BriefDescription": "This metric measures the ratio of Instruction TLB Walks to the total number of Instruction TLB accesses. This gives an indication of the effectiveness of the Instruction TLB accesses.", - "ScaleUnit": "1per TLB access", - "MetricGroup": "Miss_Ratio;ITLB_Effectiveness" - }, - { - "MetricName": "l1d_cache_miss_ratio", - "MetricExpr": "L1D_CACHE_REFILL / L1D_CACHE", - "BriefDescription": "This metric measures the ratio of L1 D-cache accesses missed to the total number of L1 D-cache accesses. This gives an indication of the effectiveness of the L1 D-cache.", - "ScaleUnit": "1per cache access", - "MetricGroup": "Miss_Ratio;L1D_Cache_Effectiveness" - }, - { - "MetricName": "l1d_cache_mpki", - "MetricExpr": "1000 * (L1D_CACHE_REFILL / INST_RETIRED)", - "BriefDescription": "This metric measures the number of L1 D-cache accesses missed per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;L1D_Cache_Effectiveness" - }, - { - "MetricName": "l1d_cache_rw_miss_ratio", - "MetricExpr": "l1d_demand_misses / l1d_demand_accesses", - "BriefDescription": "This metric measures the ratio of L1 D-cache Read accesses missed to the total number of L1 D-cache accesses. 
This gives an indication of the effectiveness of the L1 D-cache for demand Load or Store traffic.", - "ScaleUnit": "1per cache access", - "MetricGroup": "L1I_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1d_demand_accesses", - "MetricExpr": "L1D_CACHE_RW", - "BriefDescription": "This metric measures the count of L1 D-cache accesses incurred on Load or Store by the Instructions stream of the program", - "ScaleUnit": "1count", - "MetricGroup": "L1I_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1d_demand_misses", - "MetricExpr": "L1D_CACHE_REFILL_RW", - "BriefDescription": "This metric measures the count of L1 D-cache misses incurred on a Load or Store by the Instructions stream of the program", - "ScaleUnit": "1count", - "MetricGroup": "L1I_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1d_prf_accuracy", - "MetricExpr": "100 * (l1d_useful_prf / l1d_refilled_prf)", - "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the Instruction stream", - "ScaleUnit": "1percent of prefetch", - "MetricGroup": "L1I_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1d_prf_coverage", - "MetricExpr": "100 * (l1d_useful_prf / (l1d_demand_misses + l1d_refilled_prf))", - "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.", - "ScaleUnit": "1percent of cache access", - "MetricGroup": "L1I_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1d_refilled_prf", - "MetricExpr": "L1D_CACHE_REFILL_HWPRF + L1D_CACHE_REFILL_PRFM + L1D_LFB_HIT_RW_FHWPRF + L1D_LFB_HIT_RW_FPRFM", - "BriefDescription": "This metric measures the count of cache lines refilled by L1 Data prefetcher (hardware prefetches or software preload) into L1 D-cache.", - "ScaleUnit": "1count", - "MetricGroup": "L1I_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1d_tlb_miss_ratio", - "MetricExpr": "L1D_TLB_REFILL / L1D_TLB", - "BriefDescription": "This metric measures the ratio of L1 
Data TLB accesses missed to the total number of L1 Data TLB accesses. This gives an indication of the effectiveness of the L1 Data TLB.", - "ScaleUnit": "1per TLB access", - "MetricGroup": "Miss_Ratio;DTLB_Effectiveness" - }, - { - "MetricName": "l1d_tlb_mpki", - "MetricExpr": "1000 * (L1D_TLB_REFILL / INST_RETIRED)", - "BriefDescription": "This metric measures the number of L1 Data TLB accesses missed per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;DTLB_Effectiveness" - }, - { - "MetricName": "l1d_useful_prf", - "MetricExpr": "L1D_CACHE_HIT_RW_FPRF + L1D_LFB_HIT_RW_FHWPRF + L1D_LFB_HIT_RW_FPRFM", - "BriefDescription": "This metric measures the count of cache lines refilled by L1 Data prefetcher (hardware prefetches or software preload) into L1 D-cache which are further used by Load or Store from the Instruction stream of the program.", - "ScaleUnit": "1count", - "MetricGroup": "L1I_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1i_cache_miss_ratio", - "MetricExpr": "L1I_CACHE_REFILL / L1I_CACHE", - "BriefDescription": "This metric measures the ratio of L1 I-cache accesses missed to the total number of L1 I-cache accesses. This gives an indication of the effectiveness of the L1 I-cache.", - "ScaleUnit": "1per cache access", - "MetricGroup": "Miss_Ratio;L1I_Cache_Effectiveness" - }, - { - "MetricName": "l1i_cache_mpki", - "MetricExpr": "1000 * (L1I_CACHE_REFILL / INST_RETIRED)", - "BriefDescription": "This metric measures the number of L1 I-cache accesses missed per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;L1I_Cache_Effectiveness" - }, - { - "MetricName": "l1i_cache_rd_miss_ratio", - "MetricExpr": "l1i_demand_misses / l1i_demand_accesses", - "BriefDescription": "This metric measures the ratio of L1 I-cache Read accesses missed to the total number of L1 I-cache accesses. This gives an indication of the effectiveness of the L1 I-cache for demand Instruction fetch traffic. 
Note that cache accesses in this cache is demand Instruction fetch.", - "ScaleUnit": "1per cache access", - "MetricGroup": "L1D_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1i_demand_accesses", - "MetricExpr": "L1I_CACHE_RD", - "BriefDescription": "This metric measures the count of L1 I-cache accesses caused by an Instruction fetch by the Instructions stream of the program", - "ScaleUnit": "1Count", - "MetricGroup": "L1D_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1i_demand_misses", - "MetricExpr": "L1I_CACHE_REFILL_RD", - "BriefDescription": "This metric measures the count of L1 I-cache misses caused by an Instruction fetch by the Instructions stream of the program", - "ScaleUnit": "1Count", - "MetricGroup": "L1D_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1i_prf_accuracy", - "MetricExpr": "100 * (l1i_useful_prf / l1i_refilled_prf)", - "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the Instruction stream", - "ScaleUnit": "1percent of prefetch", - "MetricGroup": "L1D_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1i_prf_coverage", - "MetricExpr": "100 * (l1i_useful_prf / (l1i_demand_misses + l1i_refilled_prf))", - "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.", - "ScaleUnit": "1percent of cache access", - "MetricGroup": "L1D_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1i_refilled_prf", - "MetricExpr": "L1I_CACHE_REFILL_HWPRF + L1I_CACHE_REFILL_PRFM", - "BriefDescription": "This metric measures the count of cache lines refilled by L1 Instruction prefetcher (hardware prefetches or software preload) into L1 I-cache.", - "ScaleUnit": "1count", - "MetricGroup": "L1D_Prefetcher_Effectiveness" - }, - { - "MetricName": "l1i_tlb_miss_ratio", - "MetricExpr": "L1I_TLB_REFILL / L1I_TLB", - "BriefDescription": "This metric measures the ratio of L1 Instruction TLB accesses missed to the total number of L1 
Instruction TLB accesses. This gives an indication of the effectiveness of the L1 Instruction TLB.", - "ScaleUnit": "1per TLB access", - "MetricGroup": "Miss_Ratio;ITLB_Effectiveness" - }, - { - "MetricName": "l1i_tlb_mpki", - "MetricExpr": "1000 * (L1I_TLB_REFILL / INST_RETIRED)", - "BriefDescription": "This metric measures the number of L1 Instruction TLB accesses missed per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;ITLB_Effectiveness" - }, - { - "MetricName": "l1i_useful_prf", - "MetricExpr": "L1I_CACHE_HIT_RD_FPRF", - "BriefDescription": "This metric measures the count of cache lines refilled by L1 Instruction prefetcher (hardware prefetches or software preload) into L1 I-cache which are further used by Instruction stream of the program.", - "ScaleUnit": "1count", - "MetricGroup": "L1D_Prefetcher_Effectiveness" - }, - { - "MetricName": "l2_cache_miss_ratio", - "MetricExpr": "L2D_CACHE_REFILL / L2D_CACHE", - "BriefDescription": "This metric measures the ratio of L2 cache accesses missed to the total number of L2 cache accesses. 
This gives an indication of the effectiveness of the L2 cache, which is a unified cache that stores both Data and Instruction.", - "ScaleUnit": "1per cache access", - "MetricGroup": "Miss_Ratio;L2_Cache_Effectiveness" - }, - { - "MetricName": "l2_cache_mpki", - "MetricExpr": "1000 * (L2D_CACHE_REFILL / INST_RETIRED)", - "BriefDescription": "This metric measures the number of L2 unified cache accesses missed per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;L2_Cache_Effectiveness" - }, - { - "MetricName": "l2_tlb_miss_ratio", - "MetricExpr": "L2D_TLB_REFILL / L2D_TLB", - "BriefDescription": "This metric measures the ratio of L2 unified TLB accesses missed to the total number of L2 unified TLB accesses.", - "ScaleUnit": "1per TLB access", - "MetricGroup": "Miss_Ratio;ITLB_Effectiveness;DTLB_Effectiveness" - }, - { - "MetricName": "l2_tlb_mpki", - "MetricExpr": "1000 * (L2D_TLB_REFILL / INST_RETIRED)", - "BriefDescription": "This metric measures the number of L2 unified TLB accesses missed per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;ITLB_Effectiveness;DTLB_Effectiveness" - }, - { - "MetricName": "l2d_cache_rwl1prf_miss_ratio", - "MetricExpr": "l2d_demand_misses / l2d_demand_accesses", - "BriefDescription": "This metric measures the ratio of L2 D-cache Read accesses missed to the total number of L2 D-cache accesses.", - "ScaleUnit": "1per cache access", - "MetricGroup": "L2_Prefetcher_Effectiveness" - }, - { - "MetricName": "l2d_demand_accesses", - "MetricExpr": "L2D_CACHE_RD + L2D_CACHE_WR + L2D_CACHE_L1PRF", - "BriefDescription": "This metric measures the count of L2 D-cache accesses incurred on an Instruction Fetch, Load, Store, or L1 prefetcher accesses by the Instructions stream of the program", - "ScaleUnit": "1count", - "MetricGroup": "L2_Prefetcher_Effectiveness" - }, - { - "MetricName": "l2d_demand_misses", - "MetricExpr": "L2D_CACHE_REFILL_RD + L2D_CACHE_REFILL_WR + 
L2D_CACHE_REFILL_L1PRF", - "BriefDescription": "This metric measures the count of L2 D-cache misses incurred on an Instruction Fetch, Load, Store, or L1 prefetcher accesses by the Instructions stream of the program", - "ScaleUnit": "1count", - "MetricGroup": "L2_Prefetcher_Effectiveness" - }, - { - "MetricName": "l2d_prf_accuracy", - "MetricExpr": "100 * (l2d_useful_prf / l2d_refilled_prf)", - "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the Instruction stream", - "ScaleUnit": "1percent of prefetch", - "MetricGroup": "L2_Prefetcher_Effectiveness" - }, - { - "MetricName": "l2d_prf_coverage", - "MetricExpr": "100 * (l2d_useful_prf / (l2d_demand_misses + l2d_refilled_prf))", - "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.", - "ScaleUnit": "1percent of cache access", - "MetricGroup": "L2_Prefetcher_Effectiveness" - }, - { - "MetricName": "l2d_refilled_prf", - "MetricExpr": "(L2D_CACHE_REFILL_PRF - L2D_CACHE_REFILL_L1PRF) + L2D_LFB_HIT_RWL1PRF_FHWPRF", - "BriefDescription": "This metric measures the count of cache lines refilled by L2 Data prefetcher (hardware prefetches or software preload) into L2 D-cache.", - "ScaleUnit": "1count", - "MetricGroup": "L2_Prefetcher_Effectiveness" - }, - { - "MetricName": "l2d_useful_prf", - "MetricExpr": "L2D_CACHE_HIT_RWL1PRF_FPRF + L2D_LFB_HIT_RWL1PRF_FHWPRF", - "BriefDescription": "This metric measures the count of cache lines refilled by L2 Data prefetcher (hardware prefetches or software preload) into L2 D-cache which are further used by Instruction Fetch, Load, Store, or L1 prefetcher accesses from the Instruction stream of the program.", - "ScaleUnit": "1count", - "MetricGroup": "L2_Prefetcher_Effectiveness" - }, - { - "MetricName": "l3d_cache_rwl1prfl2prf_miss_ratio", - "MetricExpr": "l3d_demand_misses / l3d_demand_accesses", - "BriefDescription": "This metric measures the ratio of L3 D-cache Read 
accesses missed to the total number of L3 D-cache accesses. This gives an indication of the effectiveness of the L2 D-cache for demand Instruction Fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses traffic.", - "ScaleUnit": "1per cache access", - "MetricGroup": "L3_Prefetcher_Effectiveness" - }, - { - "MetricName": "l3d_demand_accesses", - "MetricExpr": "L3D_CACHE_RWL1PRFL2PRF", - "BriefDescription": "This metric measures the count of L3 D-cache accesses incurred on an Instruction Fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses by the Instructions stream of the program", - "ScaleUnit": "1count", - "MetricGroup": "L3_Prefetcher_Effectiveness" - }, - { - "MetricName": "l3d_demand_misses", - "MetricExpr": "L3D_CACHE_REFILL_RWL1PRFL2PRF", - "BriefDescription": "This metric measures the count of L3 D-cache misses incurred on an Instruction Fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses by the Instructions stream of the program", - "ScaleUnit": "1count", - "MetricGroup": "L3_Prefetcher_Effectiveness" - }, - { - "MetricName": "l3d_prf_accuracy", - "MetricExpr": "100 * (l3d_useful_prf / l3d_refilled_prf)", - "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the Instruction stream", - "ScaleUnit": "1percent of prefetch", - "MetricGroup": "L3_Prefetcher_Effectiveness" - }, - { - "MetricName": "l3d_prf_coverage", - "MetricExpr": "100 * (l3d_useful_prf / (l3d_demand_misses + l3d_refilled_prf))", - "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.", - "ScaleUnit": "1percent of cache access", - "MetricGroup": "L3_Prefetcher_Effectiveness" - }, - { - "MetricName": "l3d_refilled_prf", - "MetricExpr": "L3D_CACHE_REFILL_HWPRF + L3D_CACHE_REFILL_PRFM - L3D_CACHE_REFILL_L1PRF - L3D_CACHE_REFILL_L2PRF", - "BriefDescription": "This metric measures the count of cache lines refilled by L3 Data prefetcher (hardware prefetches or 
software preload) into L3 D-cache.", - "ScaleUnit": "1count", - "MetricGroup": "L3_Prefetcher_Effectiveness" - }, - { - "MetricName": "l3d_useful_prf", - "MetricExpr": "L3D_CACHE_HIT_RWL1PRFL2PRF_FPRF", - "BriefDescription": "This metric measures the count of cache lines refilled by L3 Data prefetcher (hardware prefetches or software preload) into L3 D-cache which are further used by Instruction Fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses from the Instruction stream of the program.", - "ScaleUnit": "1count", - "MetricGroup": "L3_Prefetcher_Effectiveness" - }, - { - "MetricName": "ll_cache_read_hit_ratio", - "MetricExpr": "(LL_CACHE_RD - LL_CACHE_MISS_RD) / LL_CACHE_RD", - "BriefDescription": "This metric measures the ratio of last level cache Read accesses hit in the cache to the total number of last level cache accesses. This gives an indication of the effectiveness of the last level cache for Read traffic. Note that cache accesses in this cache are either Data memory access or Instruction fetch as this is a system level cache.", - "ScaleUnit": "1per cache access", - "MetricGroup": "LL_Cache_Effectiveness" - }, - { - "MetricName": "ll_cache_read_miss_ratio", - "MetricExpr": "LL_CACHE_MISS_RD / LL_CACHE_RD", - "BriefDescription": "This metric measures the ratio of last level cache Read accesses missed to the total number of last level cache accesses. This gives an indication of the effectiveness of the last level cache for Read traffic. 
Note that cache accesses in this cache are either Data memory access or Instruction fetch as this is a system level cache.", - "ScaleUnit": "1per cache access", - "MetricGroup": "Miss_Ratio;LL_Cache_Effectiveness" - }, - { - "MetricName": "ll_cache_read_mpki", - "MetricExpr": "1000 * (LL_CACHE_MISS_RD / INST_RETIRED)", - "BriefDescription": "This metric measures the number of last level cache Read accesses missed per thousand Instructions executed.", - "ScaleUnit": "1MPKI", - "MetricGroup": "MPKI;LL_Cache_Effectiveness" - }, - { - "MetricName": "load_average_latency", - "MetricExpr": "MEM_ACCESS_RD_PERCYC / MEM_ACCESS", - "BriefDescription": "This metric measures the average latency of Load operations in CPU cycles", - "ScaleUnit": "1CPU cycles", - "MetricGroup": "Average_Latency" - }, - { - "MetricName": "load_percentage", - "MetricExpr": "100 * (LD_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures Load operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "nonsve_fp_ops_per_cycle", - "MetricExpr": "FP_FIXED_OPS_SPEC / CPU_CYCLES", - "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by an Instruction that is not an SVE Instruction. 
Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.", - "ScaleUnit": "1operations per cycle", - "MetricGroup": "FP_Arithmetic_Intensity" - }, - { - "MetricName": "retiring", - "MetricExpr": "100 * ((OP_RETIRED/OP_SPEC) * (1 - (STALL_SLOT/CPU_SLOT)))", - "BriefDescription": "This metric is the percentage of total slots that retired operations, which indicates cycles that were utilized efficiently.", - "ScaleUnit": "1percent of slots", - "MetricGroup": "TopdownL1" - }, - { - "MetricName": "scalar_fp_percentage", - "MetricExpr": "100 * (VFP_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures scalar floating point operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "simd_percentage", - "MetricExpr": "100 * (ASE_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures advanced SIMD operations as a percentage of total operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "store_percentage", - "MetricExpr": "100 * (ST_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures Store operations as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "sve_all_percentage", - "MetricExpr": "100 * (SVE_INST_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures scalable vector operations, including Loads and Stores, as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "Operation_Mix" - }, - { - "MetricName": "sve_fp_ops_per_cycle", - "MetricExpr": "FP_SCALE_OPS_SPEC / CPU_CYCLES", - "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by SVE Instructions. 
Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.", - "ScaleUnit": "1operations per cycle", - "MetricGroup": "FP_Arithmetic_Intensity" - }, - { - "MetricName": "sve_predicate_empty_percentage", - "MetricExpr": "100 * (SVE_PRED_EMPTY_SPEC / SVE_PRED_SPEC)", - "BriefDescription": "This metric measures scalable vector operations with no active predicates as a percentage of sve predicated operations speculatively executed.", - "ScaleUnit": "1percent of SVE predicated operations", - "MetricGroup": "SVE_Effectiveness" - }, - { - "MetricName": "sve_predicate_full_percentage", - "MetricExpr": "100 * (SVE_PRED_FULL_SPEC / SVE_PRED_SPEC)", - "BriefDescription": "This metric measures scalable vector operations with all active predicates as a percentage of sve predicated operations speculatively executed.", - "ScaleUnit": "1percent of SVE predicated operations", - "MetricGroup": "SVE_Effectiveness" - }, - { - "MetricName": "sve_predicate_partial_percentage", - "MetricExpr": "100 * (SVE_PRED_PARTIAL_SPEC / SVE_PRED_SPEC)", - "BriefDescription": "This metric measures scalable vector operations with at least one active predicates as a percentage of sve predicated operations speculatively executed.", - "ScaleUnit": "1percent of SVE predicated operations", - "MetricGroup": "SVE_Effectiveness" - }, - { - "MetricName": "sve_predicate_percentage", - "MetricExpr": "100 * (SVE_PRED_SPEC / INST_SPEC)", - "BriefDescription": "This metric measures scalable vector operations with predicates as a percentage of operations speculatively executed.", - "ScaleUnit": "1percent of operations", - "MetricGroup": "SVE_Effectiveness" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json deleted file mode 100644 index 0d657789ee482..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json +++ /dev/null @@ -1,646 +0,0 
@@ -[ - { - "ArchStdEvent": "SW_INCR", - "PublicDescription": "The Event counts software writes to the PMSWINC_EL0 (software PMU increment) register. The PMSWINC_EL0 register is a manually updated counter for use by application software. This Event could be used to measure any user program Event, such as accesses to a particular Data structure (by writing to the PMSWINC_EL0 register each time the Data structure is accessed). To use the PMSWINC_EL0 register and Event, developers must insert Instructions that write to the PMSWINC_EL0 register into the source code. Since the SW_INCR Event records Writes to the PMSWINC_EL0 register, there is no need to do a Read/Increment/Write sequence to the PMSWINC_EL0 register." - }, - { - "ArchStdEvent": "CHAIN", - "PublicDescription": "For odd-numbered counters, this Event increments the count by one for each overflow of the preceding even-numbered counter. For even-numbered counters, there is no increment. This Event is used when the even/odd pairs of registers are used as a single counter." - }, - { - "ArchStdEvent": "TRB_WRAP", - "PublicDescription": "The Event is generated each time the trace buffer current Write pointer is wrapped to the trace buffer base pointer." - }, - { - "ArchStdEvent": "TRCEXTOUT0", - "PublicDescription": "Trace unit external output 0." - }, - { - "ArchStdEvent": "TRCEXTOUT1", - "PublicDescription": "Trace unit external output 1." - }, - { - "ArchStdEvent": "TRCEXTOUT2", - "PublicDescription": "Trace unit external output 2." - }, - { - "ArchStdEvent": "TRCEXTOUT3", - "PublicDescription": "Trace unit external output 3." - }, - { - "ArchStdEvent": "CTI_TRIGOUT4", - "PublicDescription": "Cross-trigger Interface output trigger 4." - }, - { - "ArchStdEvent": "CTI_TRIGOUT5", - "PublicDescription": "Cross-trigger Interface output trigger 5." - }, - { - "ArchStdEvent": "CTI_TRIGOUT6", - "PublicDescription": "Cross-trigger Interface output trigger 6." 
- }, - { - "ArchStdEvent": "CTI_TRIGOUT7", - "PublicDescription": "Cross-trigger Interface output trigger 7." - }, - { - "EventCode": "0x00e1", - "EventName": "L1I_PRFM_REQ_DROP", - "PublicDescription": "L1 I-cache software prefetch dropped." - }, - { - "EventCode": "0x0100", - "EventName": "L1_PF_REFILL", - "PublicDescription": "L1 prefetch requests, refilled to L1 cache." - }, - { - "EventCode": "0x0120", - "EventName": "FLUSH", - "PublicDescription": "The Event counts both the CT flush and BX flush. The BR_MIS_PRED counts the BX flushes. So the FLUSH-BR_MIS_PRED gives the CT flushes." - }, - { - "EventCode": "0x0121", - "EventName": "FLUSH_MEM", - "PublicDescription": "Flushes due to memory hazards. This only includes CT flushes." - }, - { - "EventCode": "0x0122", - "EventName": "FLUSH_BAD_BRANCH", - "PublicDescription": "Flushes due to bad predicted Branch. This only includes CT flushes." - }, - { - "EventCode": "0x0124", - "EventName": "FLUSH_ISB", - "PublicDescription": "Flushes due to ISB or similar side-effects. This only includes CT flushes." - }, - { - "EventCode": "0x0125", - "EventName": "FLUSH_OTHER", - "PublicDescription": "Flushes due to other hazards. This only includes CT flushes." - }, - { - "EventCode": "0x0126", - "EventName": "STORE_STREAM", - "PublicDescription": "Stored lines in streaming no-Write-allocate mode." - }, - { - "EventCode": "0x0127", - "EventName": "NUKE_RAR", - "PublicDescription": "Load/Store nuke due to Read-after-Read ordering hazard." - }, - { - "EventCode": "0x0128", - "EventName": "NUKE_RAW", - "PublicDescription": "Load/Store nuke due to Read-after-Write ordering hazard." - }, - { - "EventCode": "0x0129", - "EventName": "L1_PF_GEN_PAGE", - "PublicDescription": "Load/Store prefetch to L1 generated, Page mode." - }, - { - "EventCode": "0x012a", - "EventName": "L1_PF_GEN_STRIDE", - "PublicDescription": "Load/Store prefetch to L1 generated, stride mode." 
- }, - { - "EventCode": "0x012b", - "EventName": "L2_PF_GEN_LD", - "PublicDescription": "Load prefetch to L2 generated." - }, - { - "EventCode": "0x012d", - "EventName": "LS_PF_TRAIN_TABLE_ALLOC", - "PublicDescription": "LS prefetch train table entry allocated." - }, - { - "EventCode": "0x0130", - "EventName": "LS_PF_GEN_TABLE_ALLOC", - "PublicDescription": "The Event counts the number of cycles with at least one table allocation, for L2 hardware prefetches (including the SW PRFM that are converted into hardware prefetches due to D-TLB miss). LS prefetch gen table allocation (for L2 prefetches)." - }, - { - "EventCode": "0x0131", - "EventName": "LS_PF_GEN_TABLE_ALLOC_PF_PEND", - "PublicDescription": "The Event counts the number of cycles in which at least one hardware prefetch is dropped due to the inability to identify a victim when the generation table is full. The hardware prefetch considered here includes the software PRFM that is converted into hardware prefetches due to D-TLB miss." - }, - { - "EventCode": "0x0132", - "EventName": "TBW", - "PublicDescription": "Tablewalks." - }, - { - "EventCode": "0x0134", - "EventName": "S1L2_HIT", - "PublicDescription": "Translation cache hit on S1L2 walk cache entry." - }, - { - "EventCode": "0x0135", - "EventName": "S1L1_HIT", - "PublicDescription": "Translation cache hit on S1L1 walk cache entry." - }, - { - "EventCode": "0x0136", - "EventName": "S1L0_HIT", - "PublicDescription": "Translation cache hit on S1L0 walk cache entry." - }, - { - "EventCode": "0x0137", - "EventName": "S2L2_HIT", - "PublicDescription": "Translation cache hit for S2L2 IPA walk cache entry." - }, - { - "EventCode": "0x0138", - "EventName": "IPA_REQ", - "PublicDescription": "Translation cache lookups for IPA to PA entries." - }, - { - "EventCode": "0x0139", - "EventName": "IPA_REFILL", - "PublicDescription": "Translation cache refills for IPA to PA entries." 
- }, - { - "EventCode": "0x013a", - "EventName": "S1_FLT", - "PublicDescription": "Stage1 tablewalk fault." - }, - { - "EventCode": "0x013b", - "EventName": "S2_FLT", - "PublicDescription": "Stage2 tablewalk fault." - }, - { - "EventCode": "0x013c", - "EventName": "COLT_REFILL", - "PublicDescription": "Aggregated page refill." - }, - { - "EventCode": "0x0145", - "EventName": "L1_PF_HIT", - "PublicDescription": "L1 prefetch requests, hitting in L1 cache." - }, - { - "EventCode": "0x0146", - "EventName": "L1_PF", - "PublicDescription": "L1 prefetch requests." - }, - { - "EventCode": "0x0147", - "EventName": "CACHE_LS_REFILL", - "PublicDescription": "L2 D-cache refill, Load/Store." - }, - { - "EventCode": "0x0148", - "EventName": "CACHE_PF", - "PublicDescription": "L2 prefetch requests." - }, - { - "EventCode": "0x0149", - "EventName": "CACHE_PF_HIT", - "PublicDescription": "L2 prefetch requests, hitting in L2 cache." - }, - { - "EventCode": "0x0150", - "EventName": "UNUSED_PF", - "PublicDescription": "L2 unused prefetch." - }, - { - "EventCode": "0x0151", - "EventName": "PFT_SENT", - "PublicDescription": "L2 prefetch TGT sent. Note that PFT_SENT != PFT_USEFUL + PFT_DROP. There may be PFT_SENT for which the accesses resulted in a SLC hit." - }, - { - "EventCode": "0x0152", - "EventName": "PFT_USEFUL", - "PublicDescription": "L2 prefetch TGT useful." - }, - { - "EventCode": "0x0153", - "EventName": "PFT_DROP", - "PublicDescription": "L2 prefetch TGT dropped." - }, - { - "EventCode": "0x0162", - "EventName": "LRQ_FULL", - "PublicDescription": "The Event counts the number of cycles the LRQ is full." - }, - { - "EventCode": "0x0163", - "EventName": "FETCH_FQ_EMPTY", - "PublicDescription": "Fetch Queue empty cycles." - }, - { - "EventCode": "0x0164", - "EventName": "FPG2", - "PublicDescription": "Forward progress guarantee. Medium range livelock triggered." - }, - { - "EventCode": "0x0165", - "EventName": "FPG", - "PublicDescription": "Forward progress guarantee. 
Tofu global livelock buster is triggered." - }, - { - "EventCode": "0x0172", - "EventName": "DEADBLOCK", - "PublicDescription": "Write-back evictions converted to Dataless EVICT. The victim line is deemed deadblock if the likeliness of a reuse is low. The Core uses Dataless evict to evict a deadblock; And it uses a evict with Data to evict an L2 line that is not a deadblock." - }, - { - "EventCode": "0x0173", - "EventName": "PF_PRQ_ALLOC_PF_PEND", - "PublicDescription": "L1 prefetch prq allocation (replacing pending)." - }, - { - "EventCode": "0x0178", - "EventName": "FETCH_ICACHE_INSTR", - "PublicDescription": "Instructions fetched from I-cache." - }, - { - "EventCode": "0x017b", - "EventName": "NEAR_CAS", - "PublicDescription": "Near atomics: compare and swap." - }, - { - "EventCode": "0x017c", - "EventName": "NEAR_CAS_PASS", - "PublicDescription": "Near atomics: compare and swap pass." - }, - { - "EventCode": "0x017d", - "EventName": "FAR_CAS", - "PublicDescription": "Far atomics: compare and swap." - }, - { - "EventCode": "0x0186", - "EventName": "L2_BTB_RELOAD_MAIN_BTB", - "PublicDescription": "Number of completed L1 BTB update initiated by L2 BTB hit which swap Branch information between L1 BTB and L2 BTB." - }, - { - "EventCode": "0x0190", - "EventName": "PF_MODE_0_CYCLES", - "PublicDescription": "Number of cycles in which the hardware prefetcher is in the most aggressive mode." - }, - { - "EventCode": "0x0191", - "EventName": "PF_MODE_1_CYCLES", - "PublicDescription": "Number of cycles in which the hardware prefetcher is in the more aggressive mode." - }, - { - "EventCode": "0x0192", - "EventName": "PF_MODE_2_CYCLES", - "PublicDescription": "Number of cycles in which the hardware prefetcher is in the less aggressive mode." - }, - { - "EventCode": "0x0193", - "EventName": "PF_MODE_3_CYCLES", - "PublicDescription": "Number of cycles in which the hardware prefetcher is in the most conservative mode." 
- }, - { - "EventCode": "0x0194", - "EventName": "TXREQ_LIMIT_MAX_CYCLES", - "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is the L2_TQ_SIZE." - }, - { - "EventCode": "0x0195", - "EventName": "TXREQ_LIMIT_3QUARTER_CYCLES", - "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 3/4 of the L2_TQ_SIZE and the L2_TQ_SIZE-1." - }, - { - "EventCode": "0x0196", - "EventName": "TXREQ_LIMIT_HALF_CYCLES", - "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 1/2 of the L2_TQ_SIZE and 3/4 of the L2_TQ_SIZE." - }, - { - "EventCode": "0x0197", - "EventName": "TXREQ_LIMIT_1QUARTER_CYCLES", - "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 1/4 of the L2_TQ_SIZE and 1/2 of the L2_TQ_SIZE." - }, - { - "EventCode": "0x019d", - "EventName": "PREFETCH_LATE_CMC", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by CMC prefetch request." - }, - { - "EventCode": "0x019e", - "EventName": "PREFETCH_LATE_BO", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by BO prefetch request." - }, - { - "EventCode": "0x019f", - "EventName": "PREFETCH_LATE_STRIDE", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by STRIDE prefetch request." - }, - { - "EventCode": "0x01a0", - "EventName": "PREFETCH_LATE_SPATIAL", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by SPATIAL prefetch request." - }, - { - "EventCode": "0x01a2", - "EventName": "PREFETCH_LATE_TBW", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by TBW prefetch request." - }, - { - "EventCode": "0x01a3", - "EventName": "PREFETCH_LATE_PAGE", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by PAGE prefetch request." 
- }, - { - "EventCode": "0x01a4", - "EventName": "PREFETCH_LATE_GSMS", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by GSMS prefetch request." - }, - { - "EventCode": "0x01a5", - "EventName": "PREFETCH_LATE_SIP_CONS", - "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by SIP_CONS prefetch request." - }, - { - "EventCode": "0x01a6", - "EventName": "PREFETCH_REFILL_CMC", - "PublicDescription": "PF/prefetch or PF/readclean request from CMC pf engine filled the L2 cache." - }, - { - "EventCode": "0x01a7", - "EventName": "PREFETCH_REFILL_BO", - "PublicDescription": "PF/prefetch or PF/readclean request from BO pf engine filled the L2 cache." - }, - { - "EventCode": "0x01a8", - "EventName": "PREFETCH_REFILL_STRIDE", - "PublicDescription": "PF/prefetch or PF/readclean request from STRIDE pf engine filled the L2 cache." - }, - { - "EventCode": "0x01a9", - "EventName": "PREFETCH_REFILL_SPATIAL", - "PublicDescription": "PF/prefetch or PF/readclean request from SPATIAL pf engine filled the L2 cache." - }, - { - "EventCode": "0x01ab", - "EventName": "PREFETCH_REFILL_TBW", - "PublicDescription": "PF/prefetch or PF/readclean request from TBW pf engine filled the L2 cache." - }, - { - "EventCode": "0x01ac", - "EventName": "PREFETCH_REFILL_PAGE", - "PublicDescription": "PF/prefetch or PF/readclean request from PAGE pf engine filled the L2 cache." - }, - { - "EventCode": "0x01ad", - "EventName": "PREFETCH_REFILL_GSMS", - "PublicDescription": "PF/prefetch or PF/readclean request from GSMS pf engine filled the L2 cache." - }, - { - "EventCode": "0x01ae", - "EventName": "PREFETCH_REFILL_SIP_CONS", - "PublicDescription": "PF/prefetch or PF/readclean request from SIP_CONS pf engine filled the L2 cache." - }, - { - "EventCode": "0x01af", - "EventName": "CACHE_HIT_LINE_PF_CMC", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by CMC prefetch request." 
- }, - { - "EventCode": "0x01b0", - "EventName": "CACHE_HIT_LINE_PF_BO", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by BO prefetch request." - }, - { - "EventCode": "0x01b1", - "EventName": "CACHE_HIT_LINE_PF_STRIDE", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by STRIDE prefetch request." - }, - { - "EventCode": "0x01b2", - "EventName": "CACHE_HIT_LINE_PF_SPATIAL", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by SPATIAL prefetch request." - }, - { - "EventCode": "0x01b4", - "EventName": "CACHE_HIT_LINE_PF_TBW", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by TBW prefetch request." - }, - { - "EventCode": "0x01b5", - "EventName": "CACHE_HIT_LINE_PF_PAGE", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by PAGE prefetch request." - }, - { - "EventCode": "0x01b6", - "EventName": "CACHE_HIT_LINE_PF_GSMS", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by GSMS prefetch request." - }, - { - "EventCode": "0x01b7", - "EventName": "CACHE_HIT_LINE_PF_SIP_CONS", - "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by SIP_CONS prefetch request." - }, - { - "EventCode": "0x01cb", - "EventName": "L2_TQ_OUTSTANDING", - "PublicDescription": "Outstanding tracker count, per cycle. The Event increments by the number of valid entries pertaining to this thread in the L2TQ, in each cycle. The Event can be used to calculate the occupancy of L2TQ by dividing this by the CPU_CYCLES Event. The L2TQ queue tracks the outstanding Read, Write ,and Snoop transactions. The Read transaction and the Write transaction entries are attributable to PE, whereas the Snoop transactions are not always attributable to PE." 
- }, - { - "EventCode": "0x01cc", - "EventName": "TXREQ_LIMIT_COUNT_CYCLES", - "PublicDescription": "The Event increments by the dynamic TXREQ value, in each cycle. This is a companion Event of TXREQ_LIMIT_MAX_CYCLES, TXREQ_LIMIT_3QUARTER_CYCLES, TXREQ_LIMIT_HALF_CYCLES, and TXREQ_LIMIT_1QUARTER_CYCLES." - }, - { - "EventCode": "0x01d2", - "EventName": "DVM_TLBI_RCVD", - "PublicDescription": "The Event counts the number of TLBI DVM message received over CHI interface, for *this* Core." - }, - { - "EventCode": "0x01d6", - "EventName": "DSB_COMMITING_LOCAL_TLBI", - "PublicDescription": "The Event counts the number of DSB that are retired and committed at least one local TLBI Instruction. This Event increments no more than once (in a cycle) even if the DSB commits multiple local TLBI Instruction." - }, - { - "EventCode": "0x01d7", - "EventName": "DSB_COMMITING_BROADCAST_TLBI", - "PublicDescription": "The Event counts the number of DSB that are retired and committed at least one broadcast TLBI Instruction. This Event increments no more than once (in a cycle) even if the DSB commits multiple broadcast TLBI Instruction." - }, - { - "EventCode": "0x01f0", - "EventName": "TMS_ST_TO_SMT_LATENCY", - "PublicDescription": "The Event counts the number of CPU cycles spent on TMS for ST-to-SMT switch. This Event is counted by both the threads - The Event in both threads increment during TMS for ST-to-SMT switch." - }, - { - "EventCode": "0x01f1", - "EventName": "TMS_SMT_TO_ST_LATENCY", - "PublicDescription": "The Event counts the number of CPU cycles spent on TMS for SMT-to-ST switch. The count also includes the CPU cycles spend due to an aborted SMT-to-ST TMS attempt. This Event is counted only by the thread that is not in WFI." - }, - { - "EventCode": "0x01f2", - "EventName": "TMS_ST_TO_SMT_COUNT", - "PublicDescription": "The Event counts the number of completed TMS from ST-to-SMT. This Event is counted only by the active thread (the one that is not in WFI). 
Note: When an active thread enters the Debug state in ST-Full resource mode, it is switched to SMT mode. This is because the inactive thread cannot wake up while the other thread remains in the Debug state. To prEvent this issue, threads operating in ST-Full resource mode are transitioned to SMT mode upon entering Debug state. The Event count will also reflect such switches from ST to SMT mode. (Also see the (NV_CPUACTLR14_EL1.chka_prEvent_st_tx_to_smt_when_tx_in_debug_state bit to disable this behavior.)" - }, - { - "EventCode": "0x01f3", - "EventName": "TMS_SMT_TO_ST_COUNT", - "PublicDescription": "The Event counts the number of completed TMS from SMT-to-ST. This Event is counted only by the thread that is not in WFI." - }, - { - "EventCode": "0x01f4", - "EventName": "TMS_SMT_TO_ST_COUNT_ABRT", - "PublicDescription": "The Event counts the number of aborted TMS from SMT-to-ST. This Event is counted only by the thread that is not in WFI." - }, - { - "EventCode": "0x021c", - "EventName": "CWT_ALLOC_ENTRY", - "PublicDescription": "Cache Way Tracker Allocate entry." - }, - { - "EventCode": "0x021d", - "EventName": "CWT_ALLOC_LINE", - "PublicDescription": "Cache Way Tracker Allocate line." - }, - { - "EventCode": "0x021e", - "EventName": "CWT_HIT", - "PublicDescription": "Cache Way Tracker hit." - }, - { - "EventCode": "0x021f", - "EventName": "CWT_HIT_TAG", - "PublicDescription": "Cache Way Tracker hit when ITAG lookup suppressed." - }, - { - "EventCode": "0x0220", - "EventName": "CWT_REPLAY_TAG", - "PublicDescription": "Cache Way Tracker causes ITAG replay due to miss when ITAG lookup suppressed." - }, - { - "EventCode": "0x0250", - "EventName": "GPT_REQ", - "PublicDescription": "GPT lookup." - }, - { - "EventCode": "0x0251", - "EventName": "GPT_WC_HIT", - "PublicDescription": "GPT lookup hit in Walk cache." - }, - { - "EventCode": "0x0252", - "EventName": "GPT_PG_HIT", - "PublicDescription": "GPT lookup hit in TLB." 
- }, - { - "EventCode": "0x01ba", - "EventName": "PREFETCH_LATE_STORE_ISSUE", - "PublicDescription": "The Event counts the number of demand requests that matches a Store-issue prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." - }, - { - "EventCode": "0x01bb", - "EventName": "PREFETCH_LATE_STORE_STRIDE", - "PublicDescription": "The Event counts the number of demand requests that matches a Store-stride prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." - }, - { - "EventCode": "0x01bc", - "EventName": "PREFETCH_LATE_PC_OFFSET", - "PublicDescription": "The Event counts the number of demand requests that matches a PC-offset prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." - }, - { - "EventCode": "0x01bd", - "EventName": "PREFETCH_LATE_IFUPF", - "PublicDescription": "The Event counts the number of demand requests that matches a IFU prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." - }, - { - "EventCode": "0x01be", - "EventName": "PREFETCH_REFILL_STORE_ISSUE", - "PublicDescription": "The Event counts the number of cache refills due to Store-Issue prefetcher." - }, - { - "EventCode": "0x01bf", - "EventName": "PREFETCH_REFILL_STORE_STRIDE", - "PublicDescription": "The Event counts the number of cache refills due to Store-stride prefetcher." - }, - { - "EventCode": "0x01c0", - "EventName": "PREFETCH_REFILL_PC_OFFSET", - "PublicDescription": "The Event counts the number of cache refills due to PC-offset prefetcher." 
- }, - { - "EventCode": "0x01c1", - "EventName": "PREFETCH_REFILL_IFUPF", - "PublicDescription": "The Event counts the number of cache refills due to IFU prefetcher." - }, - { - "EventCode": "0x01c2", - "EventName": "CACHE_HIT_LINE_PF_STORE_ISSUE", - "PublicDescription": "The Event counts the number of first hit to a cache line filled by Store-issue prefetcher." - }, - { - "EventCode": "0x01c3", - "EventName": "CACHE_HIT_LINE_PF_STORE_STRIDE", - "PublicDescription": "The Event counts the number of first hit to a cache line filled by Store-stride prefetcher." - }, - { - "EventCode": "0x01c4", - "EventName": "CACHE_HIT_LINE_PF_PC_OFFSET", - "PublicDescription": "The Event counts the number of first hit to a cache line filled by PC-offset prefetcher." - }, - { - "EventCode": "0x01c5", - "EventName": "CACHE_HIT_LINE_PF_IFUPF", - "PublicDescription": "The Event counts the number of first hit to a cache line filled by IFU prefetcher." - }, - { - "EventCode": "0x01c6", - "EventName": "L2_PF_GEN_ST_ISSUE", - "PublicDescription": "Store-issue prefetch to L2 generated." - }, - { - "EventCode": "0x01c7", - "EventName": "L2_PF_GEN_ST_STRIDE", - "PublicDescription": "Store-stride prefetch to L2 generated" - }, - { - "EventCode": "0x01ee", - "EventName": "CACHE_HIT_LINE_PF_CONVERTED_PRFM", - "PublicDescription": "The Event counts the number of first hit to a cache line filled by Converted-L1D-PRFM or Converted-L2D-PRFM. Note that L2D_CACHE_HIT_RWL1PRF_FPRFM is inclusive of CACHE_HIT_LINE_PF_CONVERTED_PRFM, where both the CACHE_HIT_LINE_PF_CONVERTED_PRFM and the L2D_CACHE_HIT_RWL1PRF_FPRFM increment on a first hit to L2 D-cache filled by Converted-L1D-PRFM or Converted-L2D-PRFM." - }, - { - "EventCode": "0x01ec", - "EventName": "PREFETCH_LATE_CONVERTED_PRFM", - "PublicDescription": "The Event counts the number of demand requests that matches a Converted-L1D-PRFM or Converted-L2D-PRFM pending refill request at L2 D-cache. 
These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements. Note that this Event is not counted by the L2D_CACHE_HIT_RWL1PRF_LATE_HWPRF, though the Converted-L1D-PRFM or Converted-L2D-PRFM are replayed by the L2PRQ." - }, - { - "EventCode": "0x01ed", - "EventName": "PREFETCH_REFILL_CONVERTED_PRFM", - "PublicDescription": "The Event counts the number of L2 D-cache refills due to Converted-L1D-PRFM or Converted-L2D-PRFM. Note : L2D_CACHE_REFILL_PRFM is inclusive of PREFETCH_REFILL_PRFM_CONVERTED, where both the PREFETCH_REFILL_PRFM_CONVERTED and the L2D_CACHE_REFILL_PRFM increment when L2 D-cache refills due to Converted-L1D-PRFM or Converted-L2D-PRFM." - }, - { - "EventCode": "0x01eb", - "EventName": "L1DPRFM_L2DPRFM_TO_L2PRQ_CONVERTED", - "PublicDescription": "The Event counts the number of Converted-L1D-PRFMs and Converted-L2D-PRFM. Activities involving the Converted-L1D-PRFM are counted by the L1D_CACHE_PRFM. However they are *not* counted by the L1D_CACHE_REFILL_PRFM, and L1D_CACHE_REFILL, as these Converted-L1D-PRFM are treated as L2 D hardware prefetches. Activities around the Converted-L1D-PRFMs and Converted-L2D-PRFMs are counted by the L2D_CACHE_PRFM, L2D_CACHE_REFILL_PRFM and L2D_CACHE_REFILL Events." - }, - { - "EventCode": "0x01ce", - "EventName": "L3DPRFM_TO_L2PRQ_CONVERTED", - "PublicDescription": "The Event counts the number of Converted-L3D-PRFMs. These are indeed L3D PRFM and activities around these PRFM are counted by the L3D_CACHE_PRFM, L3D_CACHE_REFILL_PRFM and L3D_CACHE_REFILL Events." - }, - { - "EventCode": "0x0202", - "EventName": "L0I_CACHE_RD", - "PublicDescription": "The Event counts the number of predict blocks serviced out of L0 I-cache. Note: The L0 I-cache performs at most 4 L0 I look-up in a cycle. Two of which are to service PB from L0 I. And the other two to refill L0 I-cache from L1 I. 
This Event count only the L0 I-cache lookup pertaining to servicing the PB from L0 I." - }, - { - "EventCode": "0x0203", - "EventName": "L0I_CACHE_REFILL", - "PublicDescription": "The Event counts the number of L0I cache refill from L1 I-cache." - }, - { - "EventCode": "0x0207", - "EventName": "INTR_LATENCY", - "PublicDescription": "The Event counts the number of cycles elapsed between when an Interrupt is recognized (after masking) to when a uop associated with the first Instruction in the destination exception level is allocated. If there is some other flush condition that pre-empts the Interrupt, then the cycles counted terminates early at the first Instruction executed after that flush. In the Event of dropped Interrupts (when an Interrupt is deasserted before it is taken), this counter measures the number of cycles that elapse from the moment an Interrupt is recognized (post-masking) until the Interrupt is dropped or deasserted." - }, - { - "EventCode": "0x018f", - "EventName": "L1_PF_GEN_MCMC", - "PublicDescription": "Load/Store prefetch to L1 generated, MCMC." - }, - { - "EventCode": "0x0123", - "EventName": "FLUSH_STDBYPASS", - "PublicDescription": "Flushes due to bad predecode. This only includes CT flushes." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json deleted file mode 100644 index ab384239541b8..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json +++ /dev/null @@ -1,94 +0,0 @@ -[ - { - "ArchStdEvent": "INST_RETIRED", - "PublicDescription": "The Event counts Instructions that have been architecturally executed." - }, - { - "ArchStdEvent": "CID_WRITE_RETIRED", - "PublicDescription": "The Event counts architecturally executed Writes to the CONTEXTIDR_EL1 register, which usually contain the kernel PID and can be output with hardware trace." 
- }, - { - "ArchStdEvent": "BR_IMMED_RETIRED", - "PublicDescription": "The Event counts architecturally executed direct Branches." - }, - { - "ArchStdEvent": "BR_RETURN_RETIRED", - "PublicDescription": "The Event counts architecturally executed procedure returns." - }, - { - "ArchStdEvent": "TTBR_WRITE_RETIRED", - "PublicDescription": "The Event counts architectural Writes to TTBR0/1_EL1. If virtualization host extensions are enabled (by setting the HCR_EL2.E2H bit to 1), then accesses to TTBR0/1_EL1 that are redirected to TTBR0/1_EL2, or accesses to TTBR0/1_EL12, are counted. TTBRn registers are typically updated when the kernel is swapping user-space threads or applications." - }, - { - "ArchStdEvent": "BR_RETIRED", - "PublicDescription": "The Event counts architecturally executed Branches, whether the Branch is taken or not. Instructions that explicitly write to the PC are also counted. Note that exception generating Instructions, exception return Instructions and context synchronization Instructions are not counted." - }, - { - "ArchStdEvent": "BR_MIS_PRED_RETIRED", - "PublicDescription": "The Event counts Branches counted by BR_RETIRED which were mispredicted and caused a pipeline flush." - }, - { - "ArchStdEvent": "OP_RETIRED", - "PublicDescription": "The Event counts micro-operations that are architecturally executed. This is a count of number of micro-operations retired from the commit queue in a single cycle." - }, - { - "ArchStdEvent": "BR_INDNR_TAKEN_RETIRED", - "PublicDescription": "The Event counts architecturally executed indirect Branches excluding procedure returns that were taken." - }, - { - "ArchStdEvent": "BR_IMMED_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed direct Branches that were correctly predicted." - }, - { - "ArchStdEvent": "BR_IMMED_MIS_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed direct Branches that were mispredicted and caused a pipeline flush." 
- }, - { - "ArchStdEvent": "BR_IND_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed indirect Branches including procedure returns that were correctly predicted." - }, - { - "ArchStdEvent": "BR_IND_MIS_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed indirect Branches including procedure returns that were mispredicted and caused a pipeline flush." - }, - { - "ArchStdEvent": "BR_RETURN_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed procedure returns that were correctly predicted." - }, - { - "ArchStdEvent": "BR_RETURN_MIS_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed procedure returns that were mispredicted and caused a pipeline flush." - }, - { - "ArchStdEvent": "BR_INDNR_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed indirect Branches excluding procedure returns that were correctly predicted." - }, - { - "ArchStdEvent": "BR_INDNR_MIS_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed indirect Branches excluding procedure returns that were mispredicted and caused a pipeline flush." - }, - { - "ArchStdEvent": "BR_TAKEN_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed Branches that were taken and were correctly predicted." - }, - { - "ArchStdEvent": "BR_TAKEN_MIS_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed branches that were taken and were mispredicted causing a pipeline flush." - }, - { - "ArchStdEvent": "BR_SKIP_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed Branches that were not taken and were correctly predicted." - }, - { - "ArchStdEvent": "BR_SKIP_MIS_PRED_RETIRED", - "PublicDescription": "The Event counts architecturally executed Branches that were not taken and were mispredicted causing a pipeline flush." 
- }, - { - "ArchStdEvent": "BR_PRED_RETIRED", - "PublicDescription": "The Event counts Branch Instructions counted by BR_RETIRED which were correctly predicted." - }, - { - "ArchStdEvent": "BR_IND_RETIRED", - "PublicDescription": "The Event counts architecturally executed indirect Branches including procedure returns." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json deleted file mode 100644 index 8d1fe2a8b161f..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "ArchStdEvent": "SAMPLE_POP", - "PublicDescription": "The Event counts statistical profiling sample population, the count of all operations that could be sampled but may or may not be chosen for sampling." - }, - { - "ArchStdEvent": "SAMPLE_FEED", - "PublicDescription": "The Event counts statistical profiling samples taken for sampling." - }, - { - "ArchStdEvent": "SAMPLE_FILTRATE", - "PublicDescription": "The Event counts statistical profiling samples taken which are not removed by filtering." - }, - { - "ArchStdEvent": "SAMPLE_COLLISION", - "PublicDescription": "The Event counts statistical profiling samples that have collided with a previous sample and so therefore not taken." - }, - { - "ArchStdEvent": "SAMPLE_FEED_BR", - "PublicDescription": "The Event counts statistical profiling samples taken which are Branches." - }, - { - "ArchStdEvent": "SAMPLE_FEED_LD", - "PublicDescription": "The Event counts statistical profiling samples taken which are Loads or Load atomic operations." - }, - { - "ArchStdEvent": "SAMPLE_FEED_ST", - "PublicDescription": "The Event counts statistical profiling samples taken which are Stores or Store atomic operations." - }, - { - "ArchStdEvent": "SAMPLE_FEED_OP", - "PublicDescription": "The Event counts statistical profiling samples taken which are matching any operation type filters supported." 
- }, - { - "ArchStdEvent": "SAMPLE_FEED_EVENT", - "PublicDescription": "The Event counts statistical profiling samples taken which are matching Event packet filter constraints." - }, - { - "ArchStdEvent": "SAMPLE_FEED_LAT", - "PublicDescription": "The Event counts statistical profiling samples taken which are exceeding minimum latency set by operation latency filter constraints." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json deleted file mode 100644 index 18df3f7011150..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json +++ /dev/null @@ -1,230 +0,0 @@ -[ - { - "ArchStdEvent": "INST_SPEC", - "PublicDescription": "The Event counts operations that have been speculatively executed." - }, - { - "ArchStdEvent": "OP_SPEC", - "PublicDescription": "The Event counts micro-operations speculatively executed. This is the count of the number of micro-operations dispatched in a cycle." - }, - { - "ArchStdEvent": "UNALIGNED_LD_SPEC", - "PublicDescription": "The Event counts unaligned memory Read operations issued by the CPU. This Event counts unaligned accesses (as defined by the actual Instruction), even if they are subsequently issued as multiple aligned accesses. The Event does not count preload operations (PLD, PLI). This Event is a subset of the UNALIGNED_LDST_SPEC Event." - }, - { - "ArchStdEvent": "UNALIGNED_ST_SPEC", - "PublicDescription": "The Event counts unaligned memory Write operations issued by the CPU. This Event counts unaligned accesses (as defined by the actual Instruction), even if they are subsequently issued as multiple aligned accesses. This Event is a subset of the UNALIGNED_LDST_SPEC Event." - }, - { - "ArchStdEvent": "UNALIGNED_LDST_SPEC", - "PublicDescription": "The Event counts unaligned memory operations issued by the CPU. 
This Event counts unaligned accesses (as defined by the actual Instruction), even if they are subsequently issued as multiple aligned accesses. This Event is the sum of the UNALIGNED_ST_SPEC and UNALIGNED_LD_SPEC Events." - }, - { - "ArchStdEvent": "LDREX_SPEC", - "PublicDescription": "The Event counts Load-Exclusive operations that have been speculatively executed. For example: LDREX, LDX" - }, - { - "ArchStdEvent": "STREX_PASS_SPEC", - "PublicDescription": "The Event counts Store-exclusive operations that have been speculatively executed and have successfully completed the Store operation." - }, - { - "ArchStdEvent": "STREX_FAIL_SPEC", - "PublicDescription": "The Event counts Store-exclusive operations that have been speculatively executed and have not successfully completed the Store operation." - }, - { - "ArchStdEvent": "STREX_SPEC", - "PublicDescription": "The Event counts Store-exclusive operations that have been speculatively executed. This Event is the sum of STREX_PASS_SPEC and STREX_FAIL_SPEC Events." - }, - { - "ArchStdEvent": "LD_SPEC", - "PublicDescription": "The Event counts speculatively executed Load operations including Single Instruction Multiple Data (SIMD) Load operations." - }, - { - "ArchStdEvent": "ST_SPEC", - "PublicDescription": "The Event counts speculatively executed Store operations including Single Instruction Multiple Data (SIMD) Store operations." - }, - { - "ArchStdEvent": "LDST_SPEC", - "PublicDescription": "The Event counts Load and Store operations that have been speculatively executed." - }, - { - "ArchStdEvent": "DP_SPEC", - "PublicDescription": "The Event counts speculatively executed logical or arithmetic Instructions such as MOV/MVN operations." - }, - { - "ArchStdEvent": "ASE_SPEC", - "PublicDescription": "The Event counts speculatively executed Advanced SIMD operations excluding Load, Store, and Move micro-operations that move Data to or from SIMD (vector) registers." 
- }, - { - "ArchStdEvent": "VFP_SPEC", - "PublicDescription": "The Event counts speculatively executed floating point operations. This Event does not count operations that move Data to or from floating point (vector) registers." - }, - { - "ArchStdEvent": "PC_WRITE_SPEC", - "PublicDescription": "The Event counts speculatively executed operations which cause software changes of the PC. Those operations include all taken Branch operations." - }, - { - "ArchStdEvent": "CRYPTO_SPEC", - "PublicDescription": "The Event counts speculatively executed cryptographic operations except for PMULL and VMULL operations." - }, - { - "ArchStdEvent": "BR_IMMED_SPEC", - "PublicDescription": "The Event counts direct Branch operations which are speculatively executed." - }, - { - "ArchStdEvent": "BR_RETURN_SPEC", - "PublicDescription": "The Event counts procedure return operations (RET, RETAA and RETAB) which are speculatively executed." - }, - { - "ArchStdEvent": "BR_INDIRECT_SPEC", - "PublicDescription": "The Event counts indirect branch operations including procedure returns, which are speculatively executed. This includes operations that force a software change of the PC, other than exception-generating operations and direct Branch Instructions. Some examples of the Instructions counted by this Event include BR Xn, RET, etc." - }, - { - "ArchStdEvent": "ISB_SPEC", - "PublicDescription": "The Event counts ISB operations that are executed." - }, - { - "ArchStdEvent": "DSB_SPEC", - "PublicDescription": "The Event counts DSB operations that are speculatively issued to Load/Store unit in the CPU." - }, - { - "ArchStdEvent": "DMB_SPEC", - "PublicDescription": "The Event counts DMB operations that are speculatively issued to the Load/Store unit in the CPU. This Event does not count implied barriers from Load-acquire/Store-release operations." 
- }, - { - "ArchStdEvent": "CSDB_SPEC", - "PublicDescription": "The Event counts CSDB operations that are speculatively issued to the Load/Store unit in the CPU. This Event does not count implied barriers from Load-acquire/Store-release operations." - }, - { - "ArchStdEvent": "RC_LD_SPEC", - "PublicDescription": "The Event counts any Load acquire operations that are speculatively executed. For example: LDAR, LDARH, LDARB" - }, - { - "ArchStdEvent": "RC_ST_SPEC", - "PublicDescription": "The Event counts any Store release operations that are speculatively executed. For example: STLR, STLRH, STLRB" - }, - { - "ArchStdEvent": "SIMD_INST_SPEC", - "PublicDescription": "The Event counts speculatively executed operations that are SIMD or SVE vector operations or Advanced SIMD non-scalar operations." - }, - { - "ArchStdEvent": "ASE_INST_SPEC", - "PublicDescription": "The Event counts speculatively executed Advanced SIMD operations." - }, - { - "ArchStdEvent": "SVE_INST_SPEC", - "PublicDescription": "The Event counts speculatively executed operations that are SVE operations." - }, - { - "ArchStdEvent": "INT_SPEC", - "PublicDescription": "The Event counts speculatively executed integer arithmetic operations." - }, - { - "ArchStdEvent": "SVE_PRED_SPEC", - "PublicDescription": "The Event counts speculatively executed predicated SVE operations. This counter also counts SVE operation due to Instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to Instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* Events." - }, - { - "ArchStdEvent": "SVE_PRED_EMPTY_SPEC", - "PublicDescription": "The Event counts speculatively executed predicated SVE operations with no active predicate elements. 
This counter also counts SVE operation due to Instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to Instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* Events." - }, - { - "ArchStdEvent": "SVE_PRED_FULL_SPEC", - "PublicDescription": "The Event counts speculatively executed predicated SVE operations with all predicate elements active. This counter also counts SVE operation due to Instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to Instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* Events." - }, - { - "ArchStdEvent": "SVE_PRED_PARTIAL_SPEC", - "PublicDescription": "The Event counts speculatively executed predicated SVE operations with at least one but not all active predicate elements. This counter also counts SVE operation due to Instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to Instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* Events." - }, - { - "ArchStdEvent": "SVE_PRED_NOT_FULL_SPEC", - "PublicDescription": "The Event counts speculatively executed predicated SVE operations with at least one non active predicate elements. This counter also counts SVE operation due to Instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. 
Thus, the operations due to Instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* Events." - }, - { - "ArchStdEvent": "PRF_SPEC", - "PublicDescription": "The Event counts speculatively executed operations that prefetch memory. For example, Scalar: PRFM, SVE: PRFB, PRFD, PRFH, or PRFW." - }, - { - "ArchStdEvent": "SVE_LDFF_SPEC", - "PublicDescription": "The Event counts speculatively executed SVE first fault or non-fault Load operations." - }, - { - "ArchStdEvent": "SVE_LDFF_FAULT_SPEC", - "PublicDescription": "The Event counts speculatively executed SVE first fault or non-fault Load operations that clear at least one bit in the FFR." - }, - { - "ArchStdEvent": "ASE_SVE_INT8_SPEC", - "PublicDescription": "The Event counts speculatively executed Advanced SIMD or SVE integer operations with the largest Data type an 8-bit integer." - }, - { - "ArchStdEvent": "ASE_SVE_INT16_SPEC", - "PublicDescription": "The Event counts speculatively executed Advanced SIMD or SVE integer operations with the largest Data type a 16-bit integer." - }, - { - "ArchStdEvent": "ASE_SVE_INT32_SPEC", - "PublicDescription": "The Event counts speculatively executed Advanced SIMD or SVE integer operations with the largest Data type a 32-bit integer." - }, - { - "ArchStdEvent": "ASE_SVE_INT64_SPEC", - "PublicDescription": "The Event counts speculatively executed Advanced SIMD or SVE integer operations with the largest Data type a 64-bit integer." - }, - { - "EventCode": "0x011d", - "EventName": "SPEC_RET_STACK_FULL", - "PublicDescription": "The Event counts predict pipe stalls due to speculative return address predictor full." - }, - { - "EventCode": "0x011f", - "EventName": "MOPS_SPEC", - "PublicDescription": "Macro-ops speculatively decoded." - }, - { - "EventCode": "0x0180", - "EventName": "BR_SPEC_PRED_TAKEN", - "PublicDescription": "Number of Predicted Taken from Branch Predictor." 
- }, - { - "EventCode": "0x0181", - "EventName": "BR_SPEC_PRED_TAKEN_FROM_L2BTB", - "PublicDescription": "Number of Predicted Taken Branch from L2 BTB." - }, - { - "EventCode": "0x0182", - "EventName": "BR_SPEC_PRED_TAKEN_MULTI", - "PublicDescription": "Number of Predicted Taken for Polymorphic Branch." - }, - { - "EventCode": "0x0185", - "EventName": "BR_SPEC_PRED_STATIC", - "PublicDescription": "Number of post fetch prediction." - }, - { - "EventCode": "0x01d0", - "EventName": "TLBI_LOCAL_SPEC", - "PublicDescription": "A non-broadcast TLBI Instruction executed (Speculatively or otherwise) on *this* PE." - }, - { - "EventCode": "0x01d1", - "EventName": "TLBI_BROADCAST_SPEC", - "PublicDescription": "A broadcast TLBI Instruction executed (Speculatively or otherwise) on *this* PE." - }, - { - "EventCode": "0x0200", - "EventName": "SIMD_CRYPTO_INST_SPEC", - "PublicDescription": "SIMD, SVE, and CRYPTO Instructions speculatively decoded." - }, - { - "EventCode": "0x01e7", - "EventName": "BR_SPEC_PRED_ALN_REDIR", - "PublicDescription": "BPU predict pipe align redirect (either AL-APQ hit/miss)." - }, - { - "EventCode": "0x022e", - "EventName": "VPRED_LD_SPEC", - "PublicDescription": "The Event counts the number of Speculatively-executed-Load operations with addresses produced by the value-prediction mechanism. The loaded Data might be discarded if the predicted address differs from the actual address." - }, - { - "EventCode": "0x022f", - "EventName": "VPRED_LD_SPEC_MISMATCH", - "PublicDescription": "The Event counts a subset of VPRED_LD_SPEC where the predicted Load address and the actual address mismatched." 
- } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json deleted file mode 100644 index 652c1e3305d08..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json +++ /dev/null @@ -1,145 +0,0 @@ -[ - { - "ArchStdEvent": "STALL_FRONTEND", - "PublicDescription": "The Event counts cycles when frontend could not send any micro-operations to the rename stage because of frontend resource stalls caused by fetch memory latency or Branch prediction flow stalls. STALL_FRONTEND_SLOTS counts SLOTS during the cycle when this Event counts. STALL_SLOT_FRONTEND will count SLOTS when this Event is counted on this CPU." - }, - { - "ArchStdEvent": "STALL_BACKEND", - "PublicDescription": "The Event counts cycles whenever the rename unit is unable to send any micro-operations to the backend of the pipeline because of backend resource constraints. Backend resource constraints can include issue stage fullness, execution stage fullness, or other internal pipeline resource fullness. All the backend slots were empty during the cycle when this Event counts." - }, - { - "ArchStdEvent": "STALL", - "PublicDescription": "The Event counts cycles when no operations are sent to the rename unit from the frontend or from the rename unit to the backend for any reason (either frontend or backend stall). This Event is the sum of STALL_FRONTEND and STALL_BACKEND." - }, - { - "ArchStdEvent": "STALL_SLOT_BACKEND", - "PublicDescription": "The Event counts slots per cycle in which no operations are sent from the rename unit to the backend due to backend resource constraints. STALL_BACKEND counts during the cycle when STALL_SLOT_BACKEND counts at least 1. STALL_BACKEND counts during the cycle when STALL_SLOT_BACKEND is SLOTS." 
- }, - { - "ArchStdEvent": "STALL_SLOT_FRONTEND", - "PublicDescription": "The Event counts slots per cycle in which no operations are sent to the rename unit from the frontend due to frontend resource constraints. STALL_FRONTEND counts during the cycle when STALL_SLOT_FRONTEND is SLOTS." - }, - { - "ArchStdEvent": "STALL_SLOT", - "PublicDescription": "The Event counts slots per cycle in which no operations are sent to the rename unit from the frontend or from the rename unit to the backend for any reason (either frontend or backend stall). STALL_SLOT is the sum of STALL_SLOT_FRONTEND and STALL_SLOT_BACKEND." - }, - { - "ArchStdEvent": "STALL_BACKEND_MEM", - "PublicDescription": "The Event counts cycles when the backend is stalled because there is a pending demand Load request in progress in the last level Core cache. Last level cache in this CPU is Level 2, hence this Event counts same as STALL_BACKEND_L2D." - }, - { - "ArchStdEvent": "STALL_FRONTEND_MEMBOUND", - "PublicDescription": "The Event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the memory resources." - }, - { - "ArchStdEvent": "STALL_FRONTEND_L1I", - "PublicDescription": "The Event counts cycles when the frontend is stalled because there is an Instruction fetch request pending in the L1 I-cache." - }, - { - "ArchStdEvent": "STALL_FRONTEND_MEM", - "PublicDescription": "The Event counts cycles when the frontend is stalled because there is an Instruction fetch request pending in the last level Core cache. Last level cache in this CPU is Level 2, hence this Event counts rather than STALL_FRONTEND_L2I." - }, - { - "ArchStdEvent": "STALL_FRONTEND_TLB", - "PublicDescription": "The Event counts when the frontend is stalled on any TLB misses being handled. This Event also counts the TLB accesses made by hardware prefetches." 
- }, - { - "ArchStdEvent": "STALL_FRONTEND_CPUBOUND", - "PublicDescription": "The Event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the CPU resources excluding memory resources." - }, - { - "ArchStdEvent": "STALL_FRONTEND_FLOW", - "PublicDescription": "The Event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the Branch prediction unit." - }, - { - "ArchStdEvent": "STALL_FRONTEND_FLUSH", - "PublicDescription": "The Event counts cycles when the frontend could not send any micro-operations to the rename stage as the frontend is recovering from a machine flush or resteer. Example scenarios that cause a flush include Branch mispredictions, taken exceptions, microarchitectural flush etc." - }, - { - "ArchStdEvent": "STALL_BACKEND_MEMBOUND", - "PublicDescription": "The Event counts cycles when the backend could not accept any micro-operations due to resource constraints in the memory resources." - }, - { - "ArchStdEvent": "STALL_BACKEND_L1D", - "PublicDescription": "The Event counts cycles when the backend is stalled because there is a pending demand Load request in progress in the L1 D-cache." - }, - { - "ArchStdEvent": "STALL_BACKEND_TLB", - "PublicDescription": "The Event counts cycles when the backend is stalled on any demand TLB misses being handled." - }, - { - "ArchStdEvent": "STALL_BACKEND_ST", - "PublicDescription": "The Event counts cycles when the backend is stalled and there is a Store that has not reached the pre-commit stage." - }, - { - "ArchStdEvent": "STALL_BACKEND_CPUBOUND", - "PublicDescription": "The Event counts cycles when the backend could not accept any micro-operations due to any resource constraints in the CPU excluding memory resources." 
- }, - { - "ArchStdEvent": "STALL_BACKEND_BUSY", - "PublicDescription": "The Event counts cycles when the backend could not accept any micro-operations because the issue queues are full to take any operations for execution." - }, - { - "ArchStdEvent": "STALL_BACKEND_ILOCK", - "PublicDescription": "The Event counts cycles when the backend could not accept any micro-operations due to resource constraints imposed by input dependency." - }, - { - "ArchStdEvent": "STALL_BACKEND_RENAME", - "PublicDescription": "The Event counts cycles when backend is stalled even when operations are available from the frontend but at least one is not ready to be sent to the backend because no rename register is available." - }, - { - "EventCode": "0x0158", - "EventName": "FLAG_DISP_STALL", - "PublicDescription": "Rename stalled due to FRF(Flag register file) full." - }, - { - "EventCode": "0x0159", - "EventName": "GEN_DISP_STALL", - "PublicDescription": "Rename stalled due to GRF (General-purpose register file) full." - }, - { - "EventCode": "0x015a", - "EventName": "VEC_DISP_STALL", - "PublicDescription": "Rename stalled due to VRF (Vector register file) full." - }, - { - "EventCode": "0x015c", - "EventName": "SX_IQ_STALL", - "PublicDescription": "Dispatch stalled due to IQ full, SX." - }, - { - "EventCode": "0x015d", - "EventName": "MX_IQ_STALL", - "PublicDescription": "Dispatch stalled due to IQ full, MX." - }, - { - "EventCode": "0x015e", - "EventName": "LS_IQ_STALL", - "PublicDescription": "Dispatch stalled due to IQ full, LS." - }, - { - "EventCode": "0x015f", - "EventName": "VX_IQ_STALL", - "PublicDescription": "Dispatch stalled due to IQ full, VX." - }, - { - "EventCode": "0x0160", - "EventName": "MCQ_FULL_STALL", - "PublicDescription": "Dispatch stalled due to MCQ full." - }, - { - "EventCode": "0x01cf", - "EventName": "PRD_DISP_STALL", - "PublicDescription": "Rename stalled due to predicate registers (physical) are full." 
- }, - { - "EventCode": "0x01e0", - "EventName": "CSDB_STALL", - "PublicDescription": "Rename stalled due to CSDB." - }, - { - "EventCode": "0x01e2", - "EventName": "STALL_SLOT_FRONTEND_WITHOUT_MISPRED", - "PublicDescription": "Stall slot frontend during non-mispredicted branch. The Event counts the STALL_STOT_FRONTEND Events, except for the 4 cycles following a mispredicted branch Event or 4 cycles following a commit flush&restart Event." - } -] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json deleted file mode 100644 index 9a81a62a26462..0000000000000 --- a/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json +++ /dev/null @@ -1,158 +0,0 @@ -[ - { - "ArchStdEvent": "L1I_TLB_REFILL", - "PublicDescription": "The Event counts L1 Instruction TLB refills from any Instruction fetch (Demand, hardware prefetch and software preload accesses). If there are multiple misses in the TLB that are resolved by the refill, then this Event only counts once. This Event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB." - }, - { - "ArchStdEvent": "L1D_TLB_REFILL", - "PublicDescription": "The Event counts L1 Data TLB accesses that resulted in TLB refills. If there are multiple misses in the TLB that are resolved by the refill, then this Event only counts once. This Event counts for refills caused by preload Instructions or hardware prefetch accesses. This Event counts regardless of whether the miss hits in L2 or results in a translation table walk. This Event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This Event will not count on an access from an AT(address translation) Instruction. This Event is the sum of the L1D_TLB_REFILL_RD and L1D_TLB_REFILL_WR Events." 
- }, - { - "ArchStdEvent": "L1D_TLB", - "PublicDescription": "The Event counts L1 Data TLB accesses caused by any memory Load or Store operation. Note that Load or Store Instructions can be broken up into multiple memory operations. This Event does not count TLB maintenance operations." - }, - { - "ArchStdEvent": "L1I_TLB", - "PublicDescription": "The Event counts L1 Instruction TLB accesses (caused by Demand or hardware prefetch or software preload accesses), whether the access hits or misses in the TLB. This Event counts both demand accesses and prefetch or preload generated accesses. This Event is a superset of the L1I_TLB_REFILL Event." - }, - { - "ArchStdEvent": "L2D_TLB_REFILL", - "PublicDescription": "The Event counts L2 TLB refills caused by memory operations from both Data and Instruction fetch, except for those caused by TLB maintenance operations and hardware prefetches. This Event is the sum of the L2D_TLB_REFILL_RD and L2D_TLB_REFILL_WR Events." - }, - { - "ArchStdEvent": "L2D_TLB", - "PublicDescription": "The Event counts L2 TLB accesses except those caused by TLB maintenance operations. This Event is the sum of the L2D_TLB_RD and L2D_TLB_WR Events." - }, - { - "ArchStdEvent": "DTLB_WALK", - "PublicDescription": "The Event counts number of demand Data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this Event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this Event does not count walks triggered by TLB maintenance operations. This Event does not include prefetches." 
- }, - { - "ArchStdEvent": "ITLB_WALK", - "PublicDescription": "The Event counts number of Instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this Event does not count walks triggered by TLB maintenance operations. This Event does not include prefetches." - }, - { - "ArchStdEvent": "L1D_TLB_REFILL_RD", - "PublicDescription": "The Event counts L1 Data TLB refills caused by memory Read operations. If there are multiple misses in the TLB that are resolved by the refill, then this Event only counts once. This Event counts for refills caused by preload Instructions or hardware prefetch accesses. This Event counts regardless of whether the miss hits in L2 or results in a translation table walk. This Event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This Event will not count on an access from an Address Translation (AT) Instruction. This Event is a subset of the L1D_TLB_REFILL Event." - }, - { - "ArchStdEvent": "L1D_TLB_REFILL_WR", - "PublicDescription": "The Event counts L1 Data TLB refills caused by Data side memory Write operations. If there are multiple misses in the TLB that are resolved by the refill, then this Event only counts once. This Event counts for refills caused by preload Instructions or hardware prefetch accesses. This Event counts regardless of whether the miss hits in L2 or results in a translation table walk. This Event will not count if the table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. 
This Event will not count with an access from an Address Translation (AT) Instruction. This Event is a subset of the L1D_TLB_REFILL Event." - }, - { - "ArchStdEvent": "L1D_TLB_RD", - "PublicDescription": "The Event counts L1 Data TLB accesses caused by memory Read operations. This Event counts whether the access hits or misses in the TLB. This Event does not count TLB maintenance operations." - }, - { - "ArchStdEvent": "L1D_TLB_WR", - "PublicDescription": "The Event counts any L1 Data side TLB accesses caused by memory Write operations. This Event counts whether the access hits or misses in the TLB. This Event does not count TLB maintenance operations." - }, - { - "ArchStdEvent": "L2D_TLB_REFILL_RD", - "PublicDescription": "The Event counts L2 TLB refills caused by memory Read operations from both Data and Instruction fetch except for those caused by TLB maintenance operations or hardware prefetches. This Event is a subset of the L2D_TLB_REFILL Event." - }, - { - "ArchStdEvent": "L2D_TLB_REFILL_WR", - "PublicDescription": "The Event counts L2 TLB refills caused by memory Write operations from both Data and Instruction fetch except for those caused by TLB maintenance operations. This Event is a subset of the L2D_TLB_REFILL Event." - }, - { - "ArchStdEvent": "L2D_TLB_RD", - "PublicDescription": "The Event counts L2 TLB accesses caused by memory Read operations from both Data and Instruction fetch except for those caused by TLB maintenance operations. This Event is a subset of the L2D_TLB Event." - }, - { - "ArchStdEvent": "L2D_TLB_WR", - "PublicDescription": "The Event counts L2 TLB accesses caused by memory Write operations from both Data and Instruction fetch except for those caused by TLB maintenance operations. This Event is a subset of the L2D_TLB Event." - }, - { - "ArchStdEvent": "DTLB_WALK_PERCYC", - "PublicDescription": "The Event counts the number of Data translation table walks in progress per cycle." 
- }, - { - "ArchStdEvent": "ITLB_WALK_PERCYC", - "PublicDescription": "The Event counts the number of Instruction translation table walks in progress per cycle." - }, - { - "ArchStdEvent": "L1D_TLB_RW", - "PublicDescription": "The Event counts L1 Data TLB demand accesses caused by memory Read or Write operations. This Event counts whether the access hits or misses in the TLB. This Event does not count TLB maintenance operations." - }, - { - "ArchStdEvent": "L1I_TLB_RD", - "PublicDescription": "The Event counts L1 Instruction TLB demand accesses whether the access hits or misses in the TLB." - }, - { - "ArchStdEvent": "L1D_TLB_PRFM", - "PublicDescription": "The Event counts L1 Data TLB accesses generated by software prefetch or preload memory accesses. Load or Store Instructions can be broken into multiple memory operations. This Event does not count TLB maintenance operations." - }, - { - "ArchStdEvent": "L1I_TLB_PRFM", - "PublicDescription": "The Event counts L1 Instruction TLB accesses generated by software preload or prefetch Instructions. This Event counts whether the access hits or misses in the TLB. This Event does not count TLB maintenance operations." - }, - { - "ArchStdEvent": "DTLB_HWUPD", - "PublicDescription": "The Event counts number of memory accesses triggered by a Data translation table walk and performing an update of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that this Event counts accesses triggered by software preloads, but not accesses triggered by hardware prefetchers." - }, - { - "ArchStdEvent": "ITLB_HWUPD", - "PublicDescription": "The Event counts number of memory accesses triggered by an Instruction translation table walk and performing an update of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD." 
- }, - { - "ArchStdEvent": "DTLB_STEP", - "PublicDescription": "The Event counts number of memory accesses triggered by a demand Data translation table walk and performing a Read of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that this Event counts accesses triggered by software preloads, but not accesses triggered by hardware prefetchers." - }, - { - "ArchStdEvent": "ITLB_STEP", - "PublicDescription": "The Event counts number of memory accesses triggered by an Instruction translation table walk and performing a Read of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD." - }, - { - "ArchStdEvent": "DTLB_WALK_LARGE", - "PublicDescription": "The Event counts number of demand Data translation table walks caused by a miss in the L2 TLB and yielding a large page. The set of large pages is defined as all pages with a final size higher than or equal to 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. If DTLB_WALK_BLOCK is implemented, then it is an alias for this Event in this family. Note that partial translations that cause a translation table walk are also counted. Also note that this Event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this Event does not count walks triggered by TLB maintenance operations." - }, - { - "ArchStdEvent": "ITLB_WALK_LARGE", - "PublicDescription": "The Event counts number of Instruction translation table walks caused by a miss in the L2 TLB and yielding a large page. The set of large pages is defined as all pages with a final size higher than or equal to 2MB. 
Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. In this family, this is equal to ITLB_WALK_BLOCK Event. Note that partial translations that cause a translation table walk are also counted. Also note that this Event does not count walks triggered by TLB maintenance operations." - }, - { - "ArchStdEvent": "DTLB_WALK_SMALL", - "PublicDescription": "The Event counts number of Data translation table walks caused by a miss in the L2 TLB and yielding a small page. The set of small pages is defined as all pages with a final size lower than 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. If DTLB_WALK_PAGE Event is implemented, then it is an alias for this Event in this family. Note that partial translations that cause a translation table walk are also counted. Also note that this Event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this Event does not count walks triggered by TLB maintenance operations." - }, - { - "ArchStdEvent": "ITLB_WALK_SMALL", - "PublicDescription": "The Event counts number of Instruction translation table walks caused by a miss in the L2 TLB and yielding a small page. The set of small pages is defined as all pages with a final size lower than 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. In this family, this is equal to ITLB_WALK_PAGE Event. Note that partial translations that cause a translation table walk are also counted. Also note that this Event does not count walks triggered by TLB maintenance operations." - }, - { - "ArchStdEvent": "DTLB_WALK_RW", - "PublicDescription": "The Event counts number of demand Data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. 
Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this Event does not count walks triggered by TLB maintenance operations." - }, - { - "ArchStdEvent": "ITLB_WALK_RD", - "PublicDescription": "The Event counts number of demand Instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this Event does not count walks triggered by TLB maintenance operations." - }, - { - "ArchStdEvent": "DTLB_WALK_PRFM", - "PublicDescription": "The Event counts number of software prefetches or preloads generated Data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this Event does not count walks triggered by TLB maintenance operations." - }, - { - "ArchStdEvent": "ITLB_WALK_PRFM", - "PublicDescription": "The Event counts number of software prefetches or preloads generated Instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this Event does not count walks triggered by TLB maintenance operations." 
- }, - { - "EventCode": "0x010e", - "EventName": "L1D_TLB_REFILL_RD_PF", - "PublicDescription": "L1 Data TLB refill, Read, prefetch." - }, - { - "EventCode": "0x010f", - "EventName": "L2TLB_PF_REFILL", - "PublicDescription": "L2 Data TLB refill, Read, prefetch. The Event counts MMU refills due to internal PFStream requests." - }, - { - "EventCode": "0x0223", - "EventName": "L1I_TLB_REFILL_RD", - "PublicDescription": "L1 Instruction TLB refills due to Demand miss." - }, - { - "EventCode": "0x0224", - "EventName": "L1I_TLB_REFILL_PRFM", - "PublicDescription": "L1 Instruction TLB refills due to Software prefetch miss." - } -] From 2dcaf0ba314953760bf332707328d1e362c077ab Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:25:26 -0700 Subject: [PATCH 02/17] Revert "NVIDIA: VR: SAUCE: perf: add NVIDIA Tegra410 C2C PMU" This reverts commit 4defdaeacc4b11f5de335a9fc2507dc2df77eb4b. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 151 --- drivers/perf/Kconfig | 7 - drivers/perf/Makefile | 1 - drivers/perf/nvidia_t410_c2c_pmu.c | 1061 ----------------- 4 files changed, 1220 deletions(-) delete mode 100644 drivers/perf/nvidia_t410_c2c_pmu.c diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index f81f356debe1f..11fc1c88346a0 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -9,9 +9,6 @@ metrics like memory bandwidth, latency, and utilization: * PCIE * PCIE-TGT * CPU Memory (CMEM) Latency -* NVLink-C2C -* NV-CLink -* NV-DLink PMU Driver ---------- @@ -370,151 +367,3 @@ see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_. 
Example usage:: perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}' - -NVLink-C2C PMU --------------- - -This PMU monitors latency events of memory read/write requests that pass through -the NVIDIA Chip-to-Chip (C2C) interface. Bandwidth events are not available -in this PMU, unlike the C2C PMU in Grace (Tegra241 SoC). - -The events and configuration options of this PMU device are available in sysfs, -see /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_. - -The list of events: - - * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. - * IN_RD_REQ: the number of incoming read requests. - * IN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests. - * IN_WR_REQ: the number of incoming write requests. - * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. - * OUT_RD_REQ: the number of outgoing read requests. - * OUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests. - * OUT_WR_REQ: the number of outgoing write requests. - * CYCLES: NVLink-C2C interface cycle counts. - -The incoming events count the reads/writes from remote device to the SoC. -The outgoing events count the reads/writes from the SoC to remote device. - -The sysfs /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_/peer -contains the information about the connected device. - -When the C2C interface is connected to GPU(s), the user can use the -"gpu_mask" parameter to filter traffic to/from specific GPU(s). Each bit represents the GPU -index, e.g. "gpu_mask=0x1" corresponds to GPU 0 and "gpu_mask=0x3" is for GPU 0 and 1. -The PMU will monitor all GPUs by default if not specified. - -When connected to another SoC, only the read events are available. 
- -The events can be used to calculate the average latency of the read/write requests:: - - C2C_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS - - IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ - IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ - - IN_WR_AVG_LATENCY_IN_CYCLES = IN_WR_CUM_OUTS / IN_WR_REQ - IN_WR_AVG_LATENCY_IN_NS = IN_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ - - OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ - OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ - - OUT_WR_AVG_LATENCY_IN_CYCLES = OUT_WR_CUM_OUTS / OUT_WR_REQ - OUT_WR_AVG_LATENCY_IN_NS = OUT_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ - -Example usage: - - * Count incoming traffic from all GPUs connected via NVLink-C2C:: - - perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/ - - * Count incoming traffic from GPU 0 connected via NVLink-C2C:: - - perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/ - - * Count incoming traffic from GPU 1 connected via NVLink-C2C:: - - perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/ - - * Count outgoing traffic to all GPUs connected via NVLink-C2C:: - - perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/ - - * Count outgoing traffic to GPU 0 connected via NVLink-C2C:: - - perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/ - - * Count outgoing traffic to GPU 1 connected via NVLink-C2C:: - - perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/ - -NV-CLink PMU ------------- - -This PMU monitors latency events of memory read requests that pass through -the NV-CLINK interface. Bandwidth events are not available in this PMU. -In Tegra410 SoC, the NV-CLink interface is used to connect to another Tegra410 -SoC and this PMU only counts read traffic. - -The events and configuration options of this PMU device are available in sysfs, -see /sys/bus/event_source/devices/nvidia_nvclink_pmu_. 
- -The list of events: - - * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. - * IN_RD_REQ: the number of incoming read requests. - * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. - * OUT_RD_REQ: the number of outgoing read requests. - * CYCLES: NV-CLINK interface cycle counts. - -The incoming events count the reads from remote device to the SoC. -The outgoing events count the reads from the SoC to remote device. - -The events can be used to calculate the average latency of the read requests:: - - CLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS - - IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ - IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ - - OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ - OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ - -Example usage: - - * Count incoming read traffic from remote SoC connected via NV-CLINK:: - - perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/ - - * Count outgoing read traffic to remote SoC connected via NV-CLINK:: - - perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/ - -NV-DLink PMU ------------- - -This PMU monitors latency events of memory read requests that pass through -the NV-DLINK interface. Bandwidth events are not available in this PMU. -In Tegra410 SoC, this PMU only counts CXL memory read traffic. - -The events and configuration options of this PMU device are available in sysfs, -see /sys/bus/event_source/devices/nvidia_nvdlink_pmu_. - -The list of events: - - * IN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory. - * IN_RD_REQ: the number of read requests to CXL memory. - * CYCLES: NV-DLINK interface cycle counts. 
- -The events can be used to calculate the average latency of the read requests:: - - DLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS - - IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ - IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / DLINK_FREQ_IN_GHZ - -Example usage: - - * Count read events to CXL memory:: - - perf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}' diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 7ee36efe6bc0f..9fed3c41d5ea0 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -318,11 +318,4 @@ config NVIDIA_TEGRA410_CMEM_LATENCY_PMU Enable perf support for CPU memory latency counters monitoring on NVIDIA Tegra410 SoC. -config NVIDIA_TEGRA410_C2C_PMU - tristate "NVIDIA Tegra410 C2C PMU" - depends on ARM64 && ACPI - help - Enable perf support for counters in NVIDIA C2C interface of NVIDIA - Tegra410 SoC. - endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index eb8a022dad9a7..4aa6aad393c2d 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -36,4 +36,3 @@ obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o -obj-$(CONFIG_NVIDIA_TEGRA410_C2C_PMU) += nvidia_t410_c2c_pmu.o diff --git a/drivers/perf/nvidia_t410_c2c_pmu.c b/drivers/perf/nvidia_t410_c2c_pmu.c deleted file mode 100644 index 362e0e5f8b24c..0000000000000 --- a/drivers/perf/nvidia_t410_c2c_pmu.c +++ /dev/null @@ -1,1061 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NVIDIA Tegra410 C2C PMU driver. - * - * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The C2C interface types in Tegra410. 
*/ -#define C2C_TYPE_NVLINK 0x0 -#define C2C_TYPE_NVCLINK 0x1 -#define C2C_TYPE_NVDLINK 0x2 -#define C2C_TYPE_COUNT 0x3 - -/* The type of the peer device connected to the C2C interface. */ -#define C2C_PEER_TYPE_CPU 0x0 -#define C2C_PEER_TYPE_GPU 0x1 -#define C2C_PEER_TYPE_CXLMEM 0x2 -#define C2C_PEER_TYPE_COUNT 0x3 - -/* The number of peer devices can be connected to the C2C interface. */ -#define C2C_NR_PEER_CPU 0x1 -#define C2C_NR_PEER_GPU 0x2 -#define C2C_NR_PEER_CXLMEM 0x1 -#define C2C_NR_PEER_MAX 0x2 - -/* Number of instances on each interface. */ -#define C2C_NR_INST_NVLINK 14 -#define C2C_NR_INST_NVCLINK 12 -#define C2C_NR_INST_NVDLINK 16 -#define C2C_NR_INST_MAX 16 - -/* Register offsets. */ -#define C2C_CTRL 0x864 -#define C2C_IN_STATUS 0x868 -#define C2C_CYCLE_CNTR 0x86c -#define C2C_IN_RD_CUM_OUTS_CNTR 0x874 -#define C2C_IN_RD_REQ_CNTR 0x87c -#define C2C_IN_WR_CUM_OUTS_CNTR 0x884 -#define C2C_IN_WR_REQ_CNTR 0x88c -#define C2C_OUT_STATUS 0x890 -#define C2C_OUT_RD_CUM_OUTS_CNTR 0x898 -#define C2C_OUT_RD_REQ_CNTR 0x8a0 -#define C2C_OUT_WR_CUM_OUTS_CNTR 0x8a8 -#define C2C_OUT_WR_REQ_CNTR 0x8b0 - -/* C2C_IN_STATUS register field. */ -#define C2C_IN_STATUS_CYCLE_OVF BIT(0) -#define C2C_IN_STATUS_IN_RD_CUM_OUTS_OVF BIT(1) -#define C2C_IN_STATUS_IN_RD_REQ_OVF BIT(2) -#define C2C_IN_STATUS_IN_WR_CUM_OUTS_OVF BIT(3) -#define C2C_IN_STATUS_IN_WR_REQ_OVF BIT(4) - -/* C2C_OUT_STATUS register field. */ -#define C2C_OUT_STATUS_OUT_RD_CUM_OUTS_OVF BIT(0) -#define C2C_OUT_STATUS_OUT_RD_REQ_OVF BIT(1) -#define C2C_OUT_STATUS_OUT_WR_CUM_OUTS_OVF BIT(2) -#define C2C_OUT_STATUS_OUT_WR_REQ_OVF BIT(3) - -/* Events. 
*/ -#define C2C_EVENT_CYCLES 0x0 -#define C2C_EVENT_IN_RD_CUM_OUTS 0x1 -#define C2C_EVENT_IN_RD_REQ 0x2 -#define C2C_EVENT_IN_WR_CUM_OUTS 0x3 -#define C2C_EVENT_IN_WR_REQ 0x4 -#define C2C_EVENT_OUT_RD_CUM_OUTS 0x5 -#define C2C_EVENT_OUT_RD_REQ 0x6 -#define C2C_EVENT_OUT_WR_CUM_OUTS 0x7 -#define C2C_EVENT_OUT_WR_REQ 0x8 - -#define C2C_NUM_EVENTS 0x9 -#define C2C_MASK_EVENT 0xFF -#define C2C_MAX_ACTIVE_EVENTS 32 - -#define C2C_ACTIVE_CPU_MASK 0x0 -#define C2C_ASSOCIATED_CPU_MASK 0x1 - -/* - * Maximum poll count for reading counter value using high-low-high sequence. - */ -#define HILOHI_MAX_POLL 1000 - -static unsigned long nv_c2c_pmu_cpuhp_state; - -/* PMU descriptor. */ - -/* Tracks the events assigned to the PMU for a given logical index. */ -struct nv_c2c_pmu_hw_events { - /* The events that are active. */ - struct perf_event *events[C2C_MAX_ACTIVE_EVENTS]; - - /* - * Each bit indicates a logical counter is being used (or not) for an - * event. - */ - DECLARE_BITMAP(used_ctrs, C2C_MAX_ACTIVE_EVENTS); -}; - -struct nv_c2c_pmu { - struct pmu pmu; - struct device *dev; - struct acpi_device *acpi_dev; - - const char *name; - const char *identifier; - - unsigned int c2c_type; - unsigned int peer_type; - unsigned int socket; - unsigned int nr_inst; - unsigned int nr_peer; - unsigned long peer_insts[C2C_NR_PEER_MAX][BITS_TO_LONGS(C2C_NR_INST_MAX)]; - u32 filter_default; - - struct nv_c2c_pmu_hw_events hw_events; - - cpumask_t associated_cpus; - cpumask_t active_cpu; - - struct hlist_node cpuhp_node; - - struct attribute **formats; - const struct attribute_group *attr_groups[6]; - - void __iomem *base_broadcast; - void __iomem *base[C2C_NR_INST_MAX]; -}; - -#define to_c2c_pmu(p) (container_of(p, struct nv_c2c_pmu, pmu)) - -/* Get event type from perf_event. 
*/ -static inline u32 get_event_type(struct perf_event *event) -{ - return (event->attr.config) & C2C_MASK_EVENT; -} - -static inline u32 get_filter_mask(struct perf_event *event) -{ - u32 filter; - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); - - filter = ((u32)event->attr.config1) & c2c_pmu->filter_default; - if (filter == 0) - filter = c2c_pmu->filter_default; - - return filter; -} - -/* PMU operations. */ - -static int nv_c2c_pmu_get_event_idx(struct nv_c2c_pmu_hw_events *hw_events, - struct perf_event *event) -{ - u32 idx; - - idx = find_first_zero_bit(hw_events->used_ctrs, C2C_MAX_ACTIVE_EVENTS); - if (idx >= C2C_MAX_ACTIVE_EVENTS) - return -EAGAIN; - - set_bit(idx, hw_events->used_ctrs); - - return idx; -} - -static bool -nv_c2c_pmu_validate_event(struct pmu *pmu, - struct nv_c2c_pmu_hw_events *hw_events, - struct perf_event *event) -{ - if (is_software_event(event)) - return true; - - /* Reject groups spanning multiple HW PMUs. */ - if (event->pmu != pmu) - return false; - - return nv_c2c_pmu_get_event_idx(hw_events, event) >= 0; -} - -/* - * Make sure the group of events can be scheduled at once - * on the PMU. 
- */ -static bool nv_c2c_pmu_validate_group(struct perf_event *event) -{ - struct perf_event *sibling, *leader = event->group_leader; - struct nv_c2c_pmu_hw_events fake_hw_events; - - if (event->group_leader == event) - return true; - - memset(&fake_hw_events, 0, sizeof(fake_hw_events)); - - if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, leader)) - return false; - - for_each_sibling_event(sibling, leader) { - if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, - sibling)) - return false; - } - - return nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, event); -} - -static int nv_c2c_pmu_event_init(struct perf_event *event) -{ - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - u32 event_type = get_event_type(event); - - if (event->attr.type != event->pmu->type || - event_type >= C2C_NUM_EVENTS) - return -ENOENT; - - /* - * Following other "uncore" PMUs, we do not support sampling mode or - * attach to a task (per-process mode). - */ - if (is_sampling_event(event)) { - dev_dbg(c2c_pmu->pmu.dev, "Can't support sampling events\n"); - return -EOPNOTSUPP; - } - - if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) { - dev_dbg(c2c_pmu->pmu.dev, "Can't support per-task counters\n"); - return -EINVAL; - } - - /* - * Make sure the CPU assignment is on one of the CPUs associated with - * this PMU. - */ - if (!cpumask_test_cpu(event->cpu, &c2c_pmu->associated_cpus)) { - dev_dbg(c2c_pmu->pmu.dev, - "Requested cpu is not associated with the PMU\n"); - return -EINVAL; - } - - /* Enforce the current active CPU to handle the events in this PMU. */ - event->cpu = cpumask_first(&c2c_pmu->active_cpu); - if (event->cpu >= nr_cpu_ids) - return -EINVAL; - - if (!nv_c2c_pmu_validate_group(event)) - return -EINVAL; - - hwc->idx = -1; - hwc->config = event_type; - - return 0; -} - -/* - * Read 64-bit register as a pair of 32-bit registers using hi-lo-hi sequence. 
- */ -static u64 read_reg64_hilohi(const void __iomem *addr, u32 max_poll_count) -{ - u32 val_lo, val_hi; - u64 val; - - /* Use high-low-high sequence to avoid tearing */ - do { - if (max_poll_count-- == 0) { - pr_err("NV C2C PMU: timeout hi-low-high sequence\n"); - return 0; - } - - val_hi = readl(addr + 4); - val_lo = readl(addr); - } while (val_hi != readl(addr + 4)); - - val = (((u64)val_hi << 32) | val_lo); - - return val; -} - -static void nv_c2c_pmu_check_status(struct nv_c2c_pmu *c2c_pmu, u32 instance) -{ - u32 in_status, out_status; - - in_status = readl(c2c_pmu->base[instance] + C2C_IN_STATUS); - out_status = readl(c2c_pmu->base[instance] + C2C_OUT_STATUS); - - if (in_status || out_status) - dev_warn(c2c_pmu->dev, - "C2C PMU overflow in: 0x%x, out: 0x%x\n", - in_status, out_status); -} - -static u32 nv_c2c_ctr_offset[C2C_NUM_EVENTS] = { - [C2C_EVENT_CYCLES] = C2C_CYCLE_CNTR, - [C2C_EVENT_IN_RD_CUM_OUTS] = C2C_IN_RD_CUM_OUTS_CNTR, - [C2C_EVENT_IN_RD_REQ] = C2C_IN_RD_REQ_CNTR, - [C2C_EVENT_IN_WR_CUM_OUTS] = C2C_IN_WR_CUM_OUTS_CNTR, - [C2C_EVENT_IN_WR_REQ] = C2C_IN_WR_REQ_CNTR, - [C2C_EVENT_OUT_RD_CUM_OUTS] = C2C_OUT_RD_CUM_OUTS_CNTR, - [C2C_EVENT_OUT_RD_REQ] = C2C_OUT_RD_REQ_CNTR, - [C2C_EVENT_OUT_WR_CUM_OUTS] = C2C_OUT_WR_CUM_OUTS_CNTR, - [C2C_EVENT_OUT_WR_REQ] = C2C_OUT_WR_REQ_CNTR, -}; - -static u64 nv_c2c_pmu_read_counter(struct perf_event *event) -{ - u32 ctr_id, ctr_offset, filter_mask, filter_idx, inst_idx; - unsigned long *inst_mask; - DECLARE_BITMAP(filter_bitmap, C2C_NR_PEER_MAX); - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); - u64 val = 0; - - filter_mask = get_filter_mask(event); - bitmap_from_arr32(filter_bitmap, &filter_mask, c2c_pmu->nr_peer); - - ctr_id = event->hw.config; - ctr_offset = nv_c2c_ctr_offset[ctr_id]; - - for_each_set_bit(filter_idx, filter_bitmap, c2c_pmu->nr_peer) { - inst_mask = c2c_pmu->peer_insts[filter_idx]; - for_each_set_bit(inst_idx, inst_mask, c2c_pmu->nr_inst) { - nv_c2c_pmu_check_status(c2c_pmu, inst_idx); - 
- /* - * Each instance share same clock and the driver always - * enables all instances. So we can use the counts from - * one instance for cycle counter. - */ - if (ctr_id == C2C_EVENT_CYCLES) - return read_reg64_hilohi( - c2c_pmu->base[inst_idx] + ctr_offset, - HILOHI_MAX_POLL); - - /* - * For other events, sum up the counts from all instances. - */ - val += read_reg64_hilohi( - c2c_pmu->base[inst_idx] + ctr_offset, - HILOHI_MAX_POLL); - } - } - - return val; -} - -static void nv_c2c_pmu_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev, now; - - do { - prev = local64_read(&hwc->prev_count); - now = nv_c2c_pmu_read_counter(event); - } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); - - local64_add(now - prev, &event->count); -} - -static void nv_c2c_pmu_start(struct perf_event *event, int pmu_flags) -{ - event->hw.state = 0; -} - -static void nv_c2c_pmu_stop(struct perf_event *event, int pmu_flags) -{ - event->hw.state |= PERF_HES_STOPPED; -} - -static int nv_c2c_pmu_add(struct perf_event *event, int flags) -{ - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); - struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events; - struct hw_perf_event *hwc = &event->hw; - int idx; - - if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), - &c2c_pmu->associated_cpus))) - return -ENOENT; - - idx = nv_c2c_pmu_get_event_idx(hw_events, event); - if (idx < 0) - return idx; - - hw_events->events[idx] = event; - hwc->idx = idx; - hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; - - if (flags & PERF_EF_START) - nv_c2c_pmu_start(event, PERF_EF_RELOAD); - - /* Propagate changes to the userspace mapping. 
*/ - perf_event_update_userpage(event); - - return 0; -} - -static void nv_c2c_pmu_del(struct perf_event *event, int flags) -{ - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); - struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events; - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - nv_c2c_pmu_stop(event, PERF_EF_UPDATE); - - hw_events->events[idx] = NULL; - - clear_bit(idx, hw_events->used_ctrs); - - perf_event_update_userpage(event); -} - -static void nv_c2c_pmu_read(struct perf_event *event) -{ - nv_c2c_pmu_event_update(event); -} - -static void nv_c2c_pmu_enable(struct pmu *pmu) -{ - void __iomem *bcast; - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); - - /* Check if any filter is enabled. */ - if (bitmap_empty(c2c_pmu->hw_events.used_ctrs, C2C_MAX_ACTIVE_EVENTS)) - return; - - /* Enable all the counters. */ - bcast = c2c_pmu->base_broadcast; - writel(0x1UL, bcast + C2C_CTRL); -} - -static void nv_c2c_pmu_disable(struct pmu *pmu) -{ - unsigned int idx; - void __iomem *bcast; - struct perf_event *event; - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); - - /* Disable all the counters. */ - bcast = c2c_pmu->base_broadcast; - writel(0x0UL, bcast + C2C_CTRL); - - /* - * The counters will start from 0 again on restart. - * Update the events immediately to avoid losing the counts. - */ - for_each_set_bit(idx, c2c_pmu->hw_events.used_ctrs, - C2C_MAX_ACTIVE_EVENTS) { - event = c2c_pmu->hw_events.events[idx]; - - if (!event) - continue; - - nv_c2c_pmu_event_update(event); - - local64_set(&event->hw.prev_count, 0ULL); - } -} - -/* PMU identifier attribute. 
*/ - -static ssize_t nv_c2c_pmu_identifier_show(struct device *dev, - struct device_attribute *attr, - char *page) -{ - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev)); - - return sysfs_emit(page, "%s\n", c2c_pmu->identifier); -} - -static struct device_attribute nv_c2c_pmu_identifier_attr = - __ATTR(identifier, 0444, nv_c2c_pmu_identifier_show, NULL); - -static struct attribute *nv_c2c_pmu_identifier_attrs[] = { - &nv_c2c_pmu_identifier_attr.attr, - NULL, -}; - -static struct attribute_group nv_c2c_pmu_identifier_attr_group = { - .attrs = nv_c2c_pmu_identifier_attrs, -}; - -/* Peer attribute. */ - -static ssize_t nv_c2c_pmu_peer_show(struct device *dev, - struct device_attribute *attr, - char *page) -{ - const char *peer_type[C2C_PEER_TYPE_COUNT] = { - [C2C_PEER_TYPE_CPU] = "cpu", - [C2C_PEER_TYPE_GPU] = "gpu", - [C2C_PEER_TYPE_CXLMEM] = "cxlmem", - }; - - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev)); - return sysfs_emit(page, "nr_%s=%u\n", peer_type[c2c_pmu->peer_type], - c2c_pmu->nr_peer); -} - -static struct device_attribute nv_c2c_pmu_peer_attr = - __ATTR(peer, 0444, nv_c2c_pmu_peer_show, NULL); - -static struct attribute *nv_c2c_pmu_peer_attrs[] = { - &nv_c2c_pmu_peer_attr.attr, - NULL, -}; - -static struct attribute_group nv_c2c_pmu_peer_attr_group = { - .attrs = nv_c2c_pmu_peer_attrs, -}; - -/* Format attributes. 
*/ - -#define NV_C2C_PMU_EXT_ATTR(_name, _func, _config) \ - (&((struct dev_ext_attribute[]){ \ - { \ - .attr = __ATTR(_name, 0444, _func, NULL), \ - .var = (void *)_config \ - } \ - })[0].attr.attr) - -#define NV_C2C_PMU_FORMAT_ATTR(_name, _config) \ - NV_C2C_PMU_EXT_ATTR(_name, device_show_string, _config) - -#define NV_C2C_PMU_FORMAT_EVENT_ATTR \ - NV_C2C_PMU_FORMAT_ATTR(event, "config:0-3") - -static struct attribute *nv_c2c_nvlink_pmu_formats[] = { - NV_C2C_PMU_FORMAT_EVENT_ATTR, - NV_C2C_PMU_FORMAT_ATTR(gpu_mask, "config1:0-1"), - NULL, -}; - -static struct attribute *nv_c2c_pmu_formats[] = { - NV_C2C_PMU_FORMAT_EVENT_ATTR, - NULL, -}; - -static struct attribute_group * -nv_c2c_pmu_alloc_format_attr_group(struct nv_c2c_pmu *c2c_pmu) -{ - struct attribute_group *format_group; - struct device *dev = c2c_pmu->dev; - - format_group = - devm_kzalloc(dev, sizeof(struct attribute_group), GFP_KERNEL); - if (!format_group) - return NULL; - - format_group->name = "format"; - format_group->attrs = c2c_pmu->formats; - - return format_group; -} - -/* Event attributes. 
*/ - -static ssize_t nv_c2c_pmu_sysfs_event_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct perf_pmu_events_attr *pmu_attr; - - pmu_attr = container_of(attr, typeof(*pmu_attr), attr); - return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); -} - -#define NV_C2C_PMU_EVENT_ATTR(_name, _config) \ - PMU_EVENT_ATTR_ID(_name, nv_c2c_pmu_sysfs_event_show, _config) - -static struct attribute *nv_c2c_pmu_events[] = { - NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), - NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), - NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), - NV_C2C_PMU_EVENT_ATTR(in_wr_cum_outs, C2C_EVENT_IN_WR_CUM_OUTS), - NV_C2C_PMU_EVENT_ATTR(in_wr_req, C2C_EVENT_IN_WR_REQ), - NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS), - NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ), - NV_C2C_PMU_EVENT_ATTR(out_wr_cum_outs, C2C_EVENT_OUT_WR_CUM_OUTS), - NV_C2C_PMU_EVENT_ATTR(out_wr_req, C2C_EVENT_OUT_WR_REQ), - NULL -}; - -static umode_t -nv_c2c_pmu_event_attr_is_visible(struct kobject *kobj, struct attribute *attr, - int unused) -{ - struct device *dev = kobj_to_dev(kobj); - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev)); - struct perf_pmu_events_attr *eattr; - - eattr = container_of(attr, typeof(*eattr), attr.attr); - - if (c2c_pmu->c2c_type == C2C_TYPE_NVDLINK) { - /* Only incoming reads are available. */ - switch (eattr->id) { - case C2C_EVENT_IN_WR_CUM_OUTS: - case C2C_EVENT_IN_WR_REQ: - case C2C_EVENT_OUT_RD_CUM_OUTS: - case C2C_EVENT_OUT_RD_REQ: - case C2C_EVENT_OUT_WR_CUM_OUTS: - case C2C_EVENT_OUT_WR_REQ: - return 0; - default: - return attr->mode; - } - } else { - /* Hide the write events if C2C connected to another SoC. 
*/ - if (c2c_pmu->peer_type == C2C_PEER_TYPE_CPU) { - switch (eattr->id) { - case C2C_EVENT_IN_WR_CUM_OUTS: - case C2C_EVENT_IN_WR_REQ: - case C2C_EVENT_OUT_WR_CUM_OUTS: - case C2C_EVENT_OUT_WR_REQ: - return 0; - default: - return attr->mode; - } - } - } - - return attr->mode; -} - -static const struct attribute_group nv_c2c_pmu_events_group = { - .name = "events", - .attrs = nv_c2c_pmu_events, - .is_visible = nv_c2c_pmu_event_attr_is_visible, -}; - -/* Cpumask attributes. */ - -static ssize_t nv_c2c_pmu_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pmu *pmu = dev_get_drvdata(dev); - struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); - struct dev_ext_attribute *eattr = - container_of(attr, struct dev_ext_attribute, attr); - unsigned long mask_id = (unsigned long)eattr->var; - const cpumask_t *cpumask; - - switch (mask_id) { - case C2C_ACTIVE_CPU_MASK: - cpumask = &c2c_pmu->active_cpu; - break; - case C2C_ASSOCIATED_CPU_MASK: - cpumask = &c2c_pmu->associated_cpus; - break; - default: - return 0; - } - return cpumap_print_to_pagebuf(true, buf, cpumask); -} - -#define NV_C2C_PMU_CPUMASK_ATTR(_name, _config) \ - NV_C2C_PMU_EXT_ATTR(_name, nv_c2c_pmu_cpumask_show, \ - (unsigned long)_config) - -static struct attribute *nv_c2c_pmu_cpumask_attrs[] = { - NV_C2C_PMU_CPUMASK_ATTR(cpumask, C2C_ACTIVE_CPU_MASK), - NV_C2C_PMU_CPUMASK_ATTR(associated_cpus, C2C_ASSOCIATED_CPU_MASK), - NULL, -}; - -static const struct attribute_group nv_c2c_pmu_cpumask_attr_group = { - .attrs = nv_c2c_pmu_cpumask_attrs, -}; - -/* Per PMU device attribute groups. 
*/ - -static int nv_c2c_pmu_alloc_attr_groups(struct nv_c2c_pmu *c2c_pmu) -{ - const struct attribute_group **attr_groups = c2c_pmu->attr_groups; - - attr_groups[0] = nv_c2c_pmu_alloc_format_attr_group(c2c_pmu); - attr_groups[1] = &nv_c2c_pmu_events_group; - attr_groups[2] = &nv_c2c_pmu_cpumask_attr_group; - attr_groups[3] = &nv_c2c_pmu_identifier_attr_group; - attr_groups[4] = &nv_c2c_pmu_peer_attr_group; - - if (!attr_groups[0]) - return -ENOMEM; - - return 0; -} - -static int nv_c2c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) -{ - struct nv_c2c_pmu *c2c_pmu = - hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); - - if (!cpumask_test_cpu(cpu, &c2c_pmu->associated_cpus)) - return 0; - - /* If the PMU is already managed, there is nothing to do */ - if (!cpumask_empty(&c2c_pmu->active_cpu)) - return 0; - - /* Use this CPU for event counting */ - cpumask_set_cpu(cpu, &c2c_pmu->active_cpu); - - return 0; -} - -static int nv_c2c_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) -{ - unsigned int dst; - - struct nv_c2c_pmu *c2c_pmu = - hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); - - /* Nothing to do if this CPU doesn't own the PMU */ - if (!cpumask_test_and_clear_cpu(cpu, &c2c_pmu->active_cpu)) - return 0; - - /* Choose a new CPU to migrate ownership of the PMU to */ - dst = cpumask_any_and_but(&c2c_pmu->associated_cpus, - cpu_online_mask, cpu); - if (dst >= nr_cpu_ids) - return 0; - - /* Use this CPU for event counting */ - perf_pmu_migrate_context(&c2c_pmu->pmu, cpu, dst); - cpumask_set_cpu(dst, &c2c_pmu->active_cpu); - - return 0; -} - -static int nv_c2c_pmu_get_cpus(struct nv_c2c_pmu *c2c_pmu) -{ - int ret = 0, socket = c2c_pmu->socket, cpu; - - for_each_possible_cpu(cpu) { - if (cpu_to_node(cpu) == socket) - cpumask_set_cpu(cpu, &c2c_pmu->associated_cpus); - } - - if (cpumask_empty(&c2c_pmu->associated_cpus)) { - dev_dbg(c2c_pmu->dev, - "No cpu associated with C2C PMU socket-%u\n", socket); - ret = -ENODEV; - } - - return ret; 
-} - -static int nv_c2c_pmu_init_socket(struct nv_c2c_pmu *c2c_pmu) -{ - const char *uid_str; - int ret, socket; - - uid_str = acpi_device_uid(c2c_pmu->acpi_dev); - if (!uid_str) { - ret = -ENODEV; - goto fail; - } - - ret = kstrtou32(uid_str, 0, &socket); - if (ret) - goto fail; - - c2c_pmu->socket = socket; - return 0; - -fail: - dev_err(c2c_pmu->dev, "Failed to initialize socket\n"); - return ret; -} - -static int nv_c2c_pmu_init_id(struct nv_c2c_pmu *c2c_pmu) -{ - const char *name_fmt[C2C_TYPE_COUNT] = { - [C2C_TYPE_NVLINK] = "nvidia_nvlink_c2c_pmu_%u", - [C2C_TYPE_NVCLINK] = "nvidia_nvclink_pmu_%u", - [C2C_TYPE_NVDLINK] = "nvidia_nvdlink_pmu_%u", - }; - - char *name; - int ret; - - name = devm_kasprintf(c2c_pmu->dev, GFP_KERNEL, - name_fmt[c2c_pmu->c2c_type], c2c_pmu->socket); - if (!name) { - ret = -ENOMEM; - goto fail; - } - - c2c_pmu->name = name; - - c2c_pmu->identifier = acpi_device_hid(c2c_pmu->acpi_dev); - - return 0; - -fail: - dev_err(c2c_pmu->dev, "Failed to initialize name\n"); - return ret; -} - -static int nv_c2c_pmu_init_filter(struct nv_c2c_pmu *c2c_pmu) -{ - u32 cpu_en = 0; - struct device *dev = c2c_pmu->dev; - - if (c2c_pmu->c2c_type == C2C_TYPE_NVDLINK) { - c2c_pmu->peer_type = C2C_PEER_TYPE_CXLMEM; - - c2c_pmu->nr_inst = C2C_NR_INST_NVDLINK; - c2c_pmu->peer_insts[0][0] = (1UL << c2c_pmu->nr_inst) - 1; - - c2c_pmu->nr_peer = C2C_NR_PEER_CXLMEM; - c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; - - c2c_pmu->formats = nv_c2c_pmu_formats; - - return 0; - } - - c2c_pmu->nr_inst = (c2c_pmu->c2c_type == C2C_TYPE_NVLINK) ? - C2C_NR_INST_NVLINK : C2C_NR_INST_NVCLINK; - - if (device_property_read_u32(dev, "cpu_en_mask", &cpu_en)) - dev_dbg(dev, "no cpu_en_mask property\n"); - - if (cpu_en) { - c2c_pmu->peer_type = C2C_PEER_TYPE_CPU; - - /* Fill peer_insts bitmap with instances connected to peer CPU. 
*/ - bitmap_from_arr32(c2c_pmu->peer_insts[0], &cpu_en, - c2c_pmu->nr_inst); - - c2c_pmu->nr_peer = 1; - c2c_pmu->formats = nv_c2c_pmu_formats; - } else { - u32 i; - u32 gpu_en = 0; - const char *props[C2C_NR_PEER_MAX] = { - "gpu0_en_mask", "gpu1_en_mask" - }; - - for (i = 0; i < C2C_NR_PEER_MAX; i++) { - if (device_property_read_u32(dev, props[i], &gpu_en)) - dev_dbg(dev, "no %s property\n", props[i]); - - if (gpu_en) { - /* Fill peer_insts bitmap with instances connected to peer GPU. */ - bitmap_from_arr32(c2c_pmu->peer_insts[i], &gpu_en, - c2c_pmu->nr_inst); - - c2c_pmu->nr_peer++; - } - } - - if (c2c_pmu->nr_peer == 0) { - dev_err(dev, "No GPU is enabled\n"); - return -EINVAL; - } - - c2c_pmu->peer_type = C2C_PEER_TYPE_GPU; - c2c_pmu->formats = nv_c2c_nvlink_pmu_formats; - } - - c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; - - return 0; -} - -static void *nv_c2c_pmu_init_pmu(struct platform_device *pdev) -{ - int ret; - struct nv_c2c_pmu *c2c_pmu; - struct acpi_device *acpi_dev; - struct device *dev = &pdev->dev; - - acpi_dev = ACPI_COMPANION(dev); - if (!acpi_dev) - return ERR_PTR(-ENODEV); - - c2c_pmu = devm_kzalloc(dev, sizeof(*c2c_pmu), GFP_KERNEL); - if (!c2c_pmu) - return ERR_PTR(-ENOMEM); - - c2c_pmu->dev = dev; - c2c_pmu->acpi_dev = acpi_dev; - c2c_pmu->c2c_type = (unsigned int)(unsigned long)device_get_match_data(dev); - platform_set_drvdata(pdev, c2c_pmu); - - ret = nv_c2c_pmu_init_socket(c2c_pmu); - if (ret) - goto done; - - ret = nv_c2c_pmu_init_id(c2c_pmu); - if (ret) - goto done; - - ret = nv_c2c_pmu_init_filter(c2c_pmu); - if (ret) - goto done; - -done: - if (ret) - return ERR_PTR(ret); - - return c2c_pmu; -} - -static int nv_c2c_pmu_init_mmio(struct nv_c2c_pmu *c2c_pmu) -{ - int i; - struct device *dev = c2c_pmu->dev; - struct platform_device *pdev = to_platform_device(dev); - - /* Map the address of all the instances. 
*/ - for (i = 0; i < c2c_pmu->nr_inst; i++) { - c2c_pmu->base[i] = devm_platform_ioremap_resource(pdev, i); - if (IS_ERR(c2c_pmu->base[i])) { - dev_err(dev, "Failed map address for instance %d\n", i); - return PTR_ERR(c2c_pmu->base[i]); - } - } - - /* Map broadcast address. */ - c2c_pmu->base_broadcast = devm_platform_ioremap_resource(pdev, - c2c_pmu->nr_inst); - if (IS_ERR(c2c_pmu->base_broadcast)) { - dev_err(dev, "Failed map broadcast address\n"); - return PTR_ERR(c2c_pmu->base_broadcast); - } - - return 0; -} - -static int nv_c2c_pmu_register_pmu(struct nv_c2c_pmu *c2c_pmu) -{ - int ret; - - ret = cpuhp_state_add_instance(nv_c2c_pmu_cpuhp_state, - &c2c_pmu->cpuhp_node); - if (ret) { - dev_err(c2c_pmu->dev, "Error %d registering hotplug\n", ret); - return ret; - } - - c2c_pmu->pmu = (struct pmu) { - .parent = c2c_pmu->dev, - .task_ctx_nr = perf_invalid_context, - .pmu_enable = nv_c2c_pmu_enable, - .pmu_disable = nv_c2c_pmu_disable, - .event_init = nv_c2c_pmu_event_init, - .add = nv_c2c_pmu_add, - .del = nv_c2c_pmu_del, - .start = nv_c2c_pmu_start, - .stop = nv_c2c_pmu_stop, - .read = nv_c2c_pmu_read, - .attr_groups = c2c_pmu->attr_groups, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE | - PERF_PMU_CAP_NO_INTERRUPT, - }; - - ret = perf_pmu_register(&c2c_pmu->pmu, c2c_pmu->name, -1); - if (ret) { - dev_err(c2c_pmu->dev, "Failed to register C2C PMU: %d\n", ret); - cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, - &c2c_pmu->cpuhp_node); - return ret; - } - - return 0; -} - -static int nv_c2c_pmu_probe(struct platform_device *pdev) -{ - int ret; - struct nv_c2c_pmu *c2c_pmu; - - c2c_pmu = nv_c2c_pmu_init_pmu(pdev); - if (IS_ERR(c2c_pmu)) - return PTR_ERR(c2c_pmu); - - ret = nv_c2c_pmu_init_mmio(c2c_pmu); - if (ret) - return ret; - - ret = nv_c2c_pmu_get_cpus(c2c_pmu); - if (ret) - return ret; - - ret = nv_c2c_pmu_alloc_attr_groups(c2c_pmu); - if (ret) - return ret; - - ret = nv_c2c_pmu_register_pmu(c2c_pmu); - if (ret) - return ret; - - dev_dbg(c2c_pmu->dev, 
"Registered %s PMU\n", c2c_pmu->name); - - return 0; -} - -static void nv_c2c_pmu_device_remove(struct platform_device *pdev) -{ - struct nv_c2c_pmu *c2c_pmu = platform_get_drvdata(pdev); - - perf_pmu_unregister(&c2c_pmu->pmu); - cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, &c2c_pmu->cpuhp_node); -} - -static const struct acpi_device_id nv_c2c_pmu_acpi_match[] = { - { "NVDA2023", (kernel_ulong_t)C2C_TYPE_NVLINK }, - { "NVDA2022", (kernel_ulong_t)C2C_TYPE_NVCLINK }, - { "NVDA2020", (kernel_ulong_t)C2C_TYPE_NVDLINK }, - { } -}; -MODULE_DEVICE_TABLE(acpi, nv_c2c_pmu_acpi_match); - -static struct platform_driver nv_c2c_pmu_driver = { - .driver = { - .name = "nvidia-t410-c2c-pmu", - .acpi_match_table = ACPI_PTR(nv_c2c_pmu_acpi_match), - .suppress_bind_attrs = true, - }, - .probe = nv_c2c_pmu_probe, - .remove = nv_c2c_pmu_device_remove, -}; - -static int __init nv_c2c_pmu_init(void) -{ - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "perf/nvidia/c2c:online", - nv_c2c_pmu_online_cpu, - nv_c2c_pmu_cpu_teardown); - if (ret < 0) - return ret; - - nv_c2c_pmu_cpuhp_state = ret; - return platform_driver_register(&nv_c2c_pmu_driver); -} - -static void __exit nv_c2c_pmu_exit(void) -{ - platform_driver_unregister(&nv_c2c_pmu_driver); - cpuhp_remove_multi_state(nv_c2c_pmu_cpuhp_state); -} - -module_init(nv_c2c_pmu_init); -module_exit(nv_c2c_pmu_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("NVIDIA Tegra410 C2C PMU driver"); -MODULE_AUTHOR("Besar Wicaksono "); From 54c6101c74df5efb5ebdcaf5469573034cc5e364 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:25:38 -0700 Subject: [PATCH 03/17] Revert "NVIDIA: VR: SAUCE: perf: add NVIDIA Tegra410 CPU Memory Latency PMU" This reverts commit eff2e93346d1a67c66ef96abdf6263c5bba111f5. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. 
Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 25 - drivers/perf/Kconfig | 7 - drivers/perf/Makefile | 1 - drivers/perf/nvidia_t410_cmem_latency_pmu.c | 727 ------------------ 4 files changed, 760 deletions(-) delete mode 100644 drivers/perf/nvidia_t410_cmem_latency_pmu.c diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 11fc1c88346a0..07dc447eead7c 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -8,7 +8,6 @@ metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) * PCIE * PCIE-TGT -* CPU Memory (CMEM) Latency PMU Driver ---------- @@ -343,27 +342,3 @@ Example usage: 0x10000 to 0x100FF on socket 0's PCIE RC-1:: perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/ - -CPU Memory (CMEM) Latency PMU ------------------------------ - -This PMU monitors latency events of memory read requests to local -CPU DRAM: - - * RD_REQ counters: count read requests (32B per request). - * RD_CUM_OUTS counters: accumulated outstanding request counter, which track - how many cycles the read requests are in flight. - * CYCLES counter: counts the number of elapsed cycles. - -The average latency is calculated as:: - - FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS - AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ - AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ - -The events and configuration options of this PMU device are described in sysfs, -see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_. 
- -Example usage:: - - perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}' diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 9fed3c41d5ea0..638321fc9800c 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -311,11 +311,4 @@ config MARVELL_PEM_PMU Enable support for PCIe Interface performance monitoring on Marvell platform. -config NVIDIA_TEGRA410_CMEM_LATENCY_PMU - tristate "NVIDIA Tegra410 CPU Memory Latency PMU" - depends on ARM64 - help - Enable perf support for CPU memory latency counters monitoring on - NVIDIA Tegra410 SoC. - endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 4aa6aad393c2d..ea52711a87e32 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -35,4 +35,3 @@ obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o -obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o diff --git a/drivers/perf/nvidia_t410_cmem_latency_pmu.c b/drivers/perf/nvidia_t410_cmem_latency_pmu.c deleted file mode 100644 index 9b466581c8fcc..0000000000000 --- a/drivers/perf/nvidia_t410_cmem_latency_pmu.c +++ /dev/null @@ -1,727 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NVIDIA Tegra410 CPU Memory (CMEM) Latency PMU driver. - * - * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_INSTANCES 14 -#define BCAST(pmu) pmu->base[NUM_INSTANCES] - -/* Register offsets. */ -#define CG_CTRL 0x800 -#define CTRL 0x808 -#define STATUS 0x810 -#define CYCLE_CNTR 0x818 -#define MC0_REQ_CNTR 0x820 -#define MC0_AOR_CNTR 0x830 -#define MC1_REQ_CNTR 0x838 -#define MC1_AOR_CNTR 0x848 -#define MC2_REQ_CNTR 0x850 -#define MC2_AOR_CNTR 0x860 - -/* CTRL values. 
*/ -#define CTRL_DISABLE 0x0ULL -#define CTRL_ENABLE 0x1ULL -#define CTRL_CLR 0x2ULL - -/* CG_CTRL values. */ -#define CG_CTRL_DISABLE 0x0ULL -#define CG_CTRL_ENABLE 0x1ULL - -/* STATUS register field. */ -#define STATUS_CYCLE_OVF BIT(0) -#define STATUS_MC0_AOR_OVF BIT(1) -#define STATUS_MC0_REQ_OVF BIT(3) -#define STATUS_MC1_AOR_OVF BIT(4) -#define STATUS_MC1_REQ_OVF BIT(6) -#define STATUS_MC2_AOR_OVF BIT(7) -#define STATUS_MC2_REQ_OVF BIT(9) - -/* Events. */ -#define EVENT_CYCLES 0x0 -#define EVENT_REQ 0x1 -#define EVENT_AOR 0x2 - -#define NUM_EVENTS 0x3 -#define MASK_EVENT 0x3 -#define MAX_ACTIVE_EVENTS 32 - -#define ACTIVE_CPU_MASK 0x0 -#define ASSOCIATED_CPU_MASK 0x1 - -static unsigned long cmem_lat_pmu_cpuhp_state; - -struct cmem_lat_pmu_hw_events { - struct perf_event *events[MAX_ACTIVE_EVENTS]; - DECLARE_BITMAP(used_ctrs, MAX_ACTIVE_EVENTS); -}; - -struct cmem_lat_pmu { - struct pmu pmu; - struct device *dev; - const char *name; - const char *identifier; - void __iomem *base[NUM_INSTANCES + 1]; - cpumask_t associated_cpus; - cpumask_t active_cpu; - struct hlist_node node; - struct cmem_lat_pmu_hw_events hw_events; -}; - -#define to_cmem_lat_pmu(p) \ - container_of(p, struct cmem_lat_pmu, pmu) - - -/* Get event type from perf_event. */ -static inline u32 get_event_type(struct perf_event *event) -{ - return (event->attr.config) & MASK_EVENT; -} - -/* PMU operations. */ -static int cmem_lat_pmu_get_event_idx(struct cmem_lat_pmu_hw_events *hw_events, - struct perf_event *event) -{ - unsigned int idx; - - idx = find_first_zero_bit(hw_events->used_ctrs, MAX_ACTIVE_EVENTS); - if (idx >= MAX_ACTIVE_EVENTS) - return -EAGAIN; - - set_bit(idx, hw_events->used_ctrs); - - return idx; -} - -static bool cmem_lat_pmu_validate_event(struct pmu *pmu, - struct cmem_lat_pmu_hw_events *hw_events, - struct perf_event *event) -{ - if (is_software_event(event)) - return true; - - /* Reject groups spanning multiple HW PMUs. 
*/ - if (event->pmu != pmu) - return false; - - return (cmem_lat_pmu_get_event_idx(hw_events, event) >= 0); -} - -/* - * Make sure the group of events can be scheduled at once - * on the PMU. - */ -static bool cmem_lat_pmu_validate_group(struct perf_event *event) -{ - struct perf_event *sibling, *leader = event->group_leader; - struct cmem_lat_pmu_hw_events fake_hw_events; - - if (event->group_leader == event) - return true; - - memset(&fake_hw_events, 0, sizeof(fake_hw_events)); - - if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, leader)) - return false; - - for_each_sibling_event(sibling, leader) { - if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, - sibling)) - return false; - } - - return cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, event); -} - -static int cmem_lat_pmu_event_init(struct perf_event *event) -{ - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - u32 event_type = get_event_type(event); - - if (event->attr.type != event->pmu->type || - event_type >= NUM_EVENTS) - return -ENOENT; - - /* - * Following other "uncore" PMUs, we do not support sampling mode or - * attach to a task (per-process mode). - */ - if (is_sampling_event(event)) { - dev_dbg(cmem_lat_pmu->pmu.dev, - "Can't support sampling events\n"); - return -EOPNOTSUPP; - } - - if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) { - dev_dbg(cmem_lat_pmu->pmu.dev, - "Can't support per-task counters\n"); - return -EINVAL; - } - - /* - * Make sure the CPU assignment is on one of the CPUs associated with - * this PMU. - */ - if (!cpumask_test_cpu(event->cpu, &cmem_lat_pmu->associated_cpus)) { - dev_dbg(cmem_lat_pmu->pmu.dev, - "Requested cpu is not associated with the PMU\n"); - return -EINVAL; - } - - /* Enforce the current active CPU to handle the events in this PMU. 
*/ - event->cpu = cpumask_first(&cmem_lat_pmu->active_cpu); - if (event->cpu >= nr_cpu_ids) - return -EINVAL; - - if (!cmem_lat_pmu_validate_group(event)) - return -EINVAL; - - hwc->idx = -1; - hwc->config = event_type; - - return 0; -} - -static u64 cmem_lat_pmu_read_status(struct cmem_lat_pmu *cmem_lat_pmu, - unsigned int inst) -{ - return readq(cmem_lat_pmu->base[inst] + STATUS); -} - -static u64 cmem_lat_pmu_read_cycle_counter(struct perf_event *event) -{ - const unsigned int instance = 0; - u64 status; - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); - struct device *dev = cmem_lat_pmu->dev; - - /* - * Use the reading from first instance since all instances are - * identical. - */ - status = cmem_lat_pmu_read_status(cmem_lat_pmu, instance); - if (status & STATUS_CYCLE_OVF) - dev_warn(dev, "Cycle counter overflow\n"); - - return readq(cmem_lat_pmu->base[instance] + CYCLE_CNTR); -} - -static u64 cmem_lat_pmu_read_req_counter(struct perf_event *event) -{ - unsigned int i; - u64 status, val = 0; - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); - struct device *dev = cmem_lat_pmu->dev; - - /* Sum up the counts from all instances. */ - for (i = 0; i < NUM_INSTANCES; i++) { - status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); - if (status & STATUS_MC0_REQ_OVF) - dev_warn(dev, "MC0 request counter overflow\n"); - if (status & STATUS_MC1_REQ_OVF) - dev_warn(dev, "MC1 request counter overflow\n"); - if (status & STATUS_MC2_REQ_OVF) - dev_warn(dev, "MC2 request counter overflow\n"); - - val += readq(cmem_lat_pmu->base[i] + MC0_REQ_CNTR); - val += readq(cmem_lat_pmu->base[i] + MC1_REQ_CNTR); - val += readq(cmem_lat_pmu->base[i] + MC2_REQ_CNTR); - } - - return val; -} - -static u64 cmem_lat_pmu_read_aor_counter(struct perf_event *event) -{ - unsigned int i; - u64 status, val = 0; - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); - struct device *dev = cmem_lat_pmu->dev; - - /* Sum up the counts from all instances. 
*/ - for (i = 0; i < NUM_INSTANCES; i++) { - status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); - if (status & STATUS_MC0_AOR_OVF) - dev_warn(dev, "MC0 AOR counter overflow\n"); - if (status & STATUS_MC1_AOR_OVF) - dev_warn(dev, "MC1 AOR counter overflow\n"); - if (status & STATUS_MC2_AOR_OVF) - dev_warn(dev, "MC2 AOR counter overflow\n"); - - val += readq(cmem_lat_pmu->base[i] + MC0_AOR_CNTR); - val += readq(cmem_lat_pmu->base[i] + MC1_AOR_CNTR); - val += readq(cmem_lat_pmu->base[i] + MC2_AOR_CNTR); - } - - return val; -} - -static u64 (*read_counter_fn[NUM_EVENTS])(struct perf_event *) = { - [EVENT_CYCLES] = cmem_lat_pmu_read_cycle_counter, - [EVENT_REQ] = cmem_lat_pmu_read_req_counter, - [EVENT_AOR] = cmem_lat_pmu_read_aor_counter, -}; - -static void cmem_lat_pmu_event_update(struct perf_event *event) -{ - u32 event_type; - u64 prev, now; - struct hw_perf_event *hwc = &event->hw; - - if (hwc->state & PERF_HES_STOPPED) - return; - - event_type = hwc->config; - - do { - prev = local64_read(&hwc->prev_count); - now = read_counter_fn[event_type](event); - } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); - - local64_add(now - prev, &event->count); - - hwc->state |= PERF_HES_UPTODATE; -} - -static void cmem_lat_pmu_start(struct perf_event *event, int pmu_flags) -{ - event->hw.state = 0; -} - -static void cmem_lat_pmu_stop(struct perf_event *event, int pmu_flags) -{ - event->hw.state |= PERF_HES_STOPPED; -} - -static int cmem_lat_pmu_add(struct perf_event *event, int flags) -{ - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); - struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; - struct hw_perf_event *hwc = &event->hw; - int idx; - - if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), - &cmem_lat_pmu->associated_cpus))) - return -ENOENT; - - idx = cmem_lat_pmu_get_event_idx(hw_events, event); - if (idx < 0) - return idx; - - hw_events->events[idx] = event; - hwc->idx = idx; - hwc->state = PERF_HES_STOPPED | 
PERF_HES_UPTODATE; - - if (flags & PERF_EF_START) - cmem_lat_pmu_start(event, PERF_EF_RELOAD); - - /* Propagate changes to the userspace mapping. */ - perf_event_update_userpage(event); - - return 0; -} - -static void cmem_lat_pmu_del(struct perf_event *event, int flags) -{ - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); - struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - cmem_lat_pmu_stop(event, PERF_EF_UPDATE); - - hw_events->events[idx] = NULL; - - clear_bit(idx, hw_events->used_ctrs); - - perf_event_update_userpage(event); -} - -static void cmem_lat_pmu_read(struct perf_event *event) -{ - cmem_lat_pmu_event_update(event); -} - -static inline void cmem_lat_pmu_cg_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val) -{ - writeq(val, BCAST(cmem_lat_pmu) + CG_CTRL); -} - -static inline void cmem_lat_pmu_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val) -{ - writeq(val, BCAST(cmem_lat_pmu) + CTRL); -} - -static void cmem_lat_pmu_enable(struct pmu *pmu) -{ - bool disabled; - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); - - disabled = bitmap_empty( - cmem_lat_pmu->hw_events.used_ctrs, MAX_ACTIVE_EVENTS); - - if (disabled) - return; - - /* Enable all the counters. */ - cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_ENABLE); - cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_ENABLE); -} - -static void cmem_lat_pmu_disable(struct pmu *pmu) -{ - int idx; - struct perf_event *event; - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); - - /* Disable all the counters. */ - cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_DISABLE); - - /* - * The counters will start from 0 again on restart. - * Update the events immediately to avoid losing the counts. 
- */ - for_each_set_bit( - idx, cmem_lat_pmu->hw_events.used_ctrs, MAX_ACTIVE_EVENTS) { - event = cmem_lat_pmu->hw_events.events[idx]; - - if (!event) - continue; - - cmem_lat_pmu_event_update(event); - - local64_set(&event->hw.prev_count, 0ULL); - } - - cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_CLR); - cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_DISABLE); -} - -/* PMU identifier attribute. */ - -static ssize_t cmem_lat_pmu_identifier_show(struct device *dev, - struct device_attribute *attr, - char *page) -{ - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev)); - - return sysfs_emit(page, "%s\n", cmem_lat_pmu->identifier); -} - -static struct device_attribute cmem_lat_pmu_identifier_attr = - __ATTR(identifier, 0444, cmem_lat_pmu_identifier_show, NULL); - -static struct attribute *cmem_lat_pmu_identifier_attrs[] = { - &cmem_lat_pmu_identifier_attr.attr, - NULL, -}; - -static struct attribute_group cmem_lat_pmu_identifier_attr_group = { - .attrs = cmem_lat_pmu_identifier_attrs, -}; - -/* Format attributes. */ - -#define NV_PMU_EXT_ATTR(_name, _func, _config) \ - (&((struct dev_ext_attribute[]){ \ - { \ - .attr = __ATTR(_name, 0444, _func, NULL), \ - .var = (void *)_config \ - } \ - })[0].attr.attr) - -static struct attribute *cmem_lat_pmu_formats[] = { - NV_PMU_EXT_ATTR(event, device_show_string, "config:0-1"), - NULL, -}; - -static const struct attribute_group cmem_lat_pmu_format_group = { - .name = "format", - .attrs = cmem_lat_pmu_formats, -}; - -/* Event attributes. 
*/ - -static ssize_t cmem_lat_pmu_sysfs_event_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct perf_pmu_events_attr *pmu_attr; - - pmu_attr = container_of(attr, typeof(*pmu_attr), attr); - return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); -} - -#define NV_PMU_EVENT_ATTR(_name, _config) \ - PMU_EVENT_ATTR_ID(_name, cmem_lat_pmu_sysfs_event_show, _config) - -static struct attribute *cmem_lat_pmu_events[] = { - NV_PMU_EVENT_ATTR(cycles, EVENT_CYCLES), - NV_PMU_EVENT_ATTR(rd_req, EVENT_REQ), - NV_PMU_EVENT_ATTR(rd_cum_outs, EVENT_AOR), - NULL -}; - -static const struct attribute_group cmem_lat_pmu_events_group = { - .name = "events", - .attrs = cmem_lat_pmu_events, -}; - -/* Cpumask attributes. */ - -static ssize_t cmem_lat_pmu_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pmu *pmu = dev_get_drvdata(dev); - struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); - struct dev_ext_attribute *eattr = - container_of(attr, struct dev_ext_attribute, attr); - unsigned long mask_id = (unsigned long)eattr->var; - const cpumask_t *cpumask; - - switch (mask_id) { - case ACTIVE_CPU_MASK: - cpumask = &cmem_lat_pmu->active_cpu; - break; - case ASSOCIATED_CPU_MASK: - cpumask = &cmem_lat_pmu->associated_cpus; - break; - default: - return 0; - } - return cpumap_print_to_pagebuf(true, buf, cpumask); -} - -#define NV_PMU_CPUMASK_ATTR(_name, _config) \ - NV_PMU_EXT_ATTR(_name, cmem_lat_pmu_cpumask_show, \ - (unsigned long)_config) - -static struct attribute *cmem_lat_pmu_cpumask_attrs[] = { - NV_PMU_CPUMASK_ATTR(cpumask, ACTIVE_CPU_MASK), - NV_PMU_CPUMASK_ATTR(associated_cpus, ASSOCIATED_CPU_MASK), - NULL, -}; - -static const struct attribute_group cmem_lat_pmu_cpumask_attr_group = { - .attrs = cmem_lat_pmu_cpumask_attrs, -}; - -/* Per PMU device attribute groups. 
*/ - -static const struct attribute_group *cmem_lat_pmu_attr_groups[] = { - &cmem_lat_pmu_identifier_attr_group, - &cmem_lat_pmu_format_group, - &cmem_lat_pmu_events_group, - &cmem_lat_pmu_cpumask_attr_group, - NULL, -}; - -static int cmem_lat_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) -{ - struct cmem_lat_pmu *cmem_lat_pmu = - hlist_entry_safe(node, struct cmem_lat_pmu, node); - - if (!cpumask_test_cpu(cpu, &cmem_lat_pmu->associated_cpus)) - return 0; - - /* If the PMU is already managed, there is nothing to do */ - if (!cpumask_empty(&cmem_lat_pmu->active_cpu)) - return 0; - - /* Use this CPU for event counting */ - cpumask_set_cpu(cpu, &cmem_lat_pmu->active_cpu); - - return 0; -} - -static int cmem_lat_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) -{ - unsigned int dst; - - struct cmem_lat_pmu *cmem_lat_pmu = - hlist_entry_safe(node, struct cmem_lat_pmu, node); - - /* Nothing to do if this CPU doesn't own the PMU */ - if (!cpumask_test_and_clear_cpu(cpu, &cmem_lat_pmu->active_cpu)) - return 0; - - /* Choose a new CPU to migrate ownership of the PMU to */ - dst = cpumask_any_and_but(&cmem_lat_pmu->associated_cpus, - cpu_online_mask, cpu); - if (dst >= nr_cpu_ids) - return 0; - - /* Use this CPU for event counting */ - perf_pmu_migrate_context(&cmem_lat_pmu->pmu, cpu, dst); - cpumask_set_cpu(dst, &cmem_lat_pmu->active_cpu); - - return 0; -} - -static int cmem_lat_pmu_get_cpus(struct cmem_lat_pmu *cmem_lat_pmu, - unsigned int socket) -{ - int ret = 0, cpu; - - for_each_possible_cpu(cpu) { - if (cpu_to_node(cpu) == socket) - cpumask_set_cpu(cpu, &cmem_lat_pmu->associated_cpus); - } - - if (cpumask_empty(&cmem_lat_pmu->associated_cpus)) { - dev_dbg(cmem_lat_pmu->dev, - "No cpu associated with PMU socket-%u\n", socket); - ret = -ENODEV; - } - - return ret; -} - -static int cmem_lat_pmu_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct acpi_device *acpi_dev; - struct cmem_lat_pmu *cmem_lat_pmu; - char *name, 
*uid_str; - int ret, i; - u32 socket; - - acpi_dev = ACPI_COMPANION(dev); - if (!acpi_dev) - return -ENODEV; - - uid_str = acpi_device_uid(acpi_dev); - if (!uid_str) - return -ENODEV; - - ret = kstrtou32(uid_str, 0, &socket); - if (ret) - return ret; - - cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL); - name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket); - if (!cmem_lat_pmu || !name) - return -ENOMEM; - - cmem_lat_pmu->dev = dev; - cmem_lat_pmu->name = name; - cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev); - platform_set_drvdata(pdev, cmem_lat_pmu); - - cmem_lat_pmu->pmu = (struct pmu) { - .parent = &pdev->dev, - .task_ctx_nr = perf_invalid_context, - .pmu_enable = cmem_lat_pmu_enable, - .pmu_disable = cmem_lat_pmu_disable, - .event_init = cmem_lat_pmu_event_init, - .add = cmem_lat_pmu_add, - .del = cmem_lat_pmu_del, - .start = cmem_lat_pmu_start, - .stop = cmem_lat_pmu_stop, - .read = cmem_lat_pmu_read, - .attr_groups = cmem_lat_pmu_attr_groups, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE | - PERF_PMU_CAP_NO_INTERRUPT, - }; - - /* Map the address of all the instances plus one for the broadcast. 
*/ - for (i = 0; i < NUM_INSTANCES + 1; i++) { - cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i); - if (IS_ERR(cmem_lat_pmu->base[i])) { - dev_err(dev, "Failed map address for instance %d\n", i); - return PTR_ERR(cmem_lat_pmu->base[i]); - } - } - - ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket); - if (ret) - return ret; - - ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state, - &cmem_lat_pmu->node); - if (ret) { - dev_err(&pdev->dev, "Error %d registering hotplug\n", ret); - return ret; - } - - cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_ENABLE); - cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_CLR); - cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_DISABLE); - - ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1); - if (ret) { - dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret); - cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, - &cmem_lat_pmu->node); - return ret; - } - - dev_dbg(&pdev->dev, "Registered %s PMU\n", name); - - return 0; -} - -static void cmem_lat_pmu_device_remove(struct platform_device *pdev) -{ - struct cmem_lat_pmu *cmem_lat_pmu = platform_get_drvdata(pdev); - - perf_pmu_unregister(&cmem_lat_pmu->pmu); - cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, - &cmem_lat_pmu->node); -} - -static const struct acpi_device_id cmem_lat_pmu_acpi_match[] = { - { "NVDA2021", }, - { } -}; -MODULE_DEVICE_TABLE(acpi, cmem_lat_pmu_acpi_match); - -static struct platform_driver cmem_lat_pmu_driver = { - .driver = { - .name = "nvidia-t410-cmem-latency-pmu", - .acpi_match_table = ACPI_PTR(cmem_lat_pmu_acpi_match), - .suppress_bind_attrs = true, - }, - .probe = cmem_lat_pmu_probe, - .remove = cmem_lat_pmu_device_remove, -}; - -static int __init cmem_lat_pmu_init(void) -{ - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "perf/nvidia/cmem_latency:online", - cmem_lat_pmu_cpu_online, - cmem_lat_pmu_cpu_teardown); - if (ret < 0) - return ret; - - cmem_lat_pmu_cpuhp_state = ret; - - return 
platform_driver_register(&cmem_lat_pmu_driver); -} - -static void __exit cmem_lat_pmu_exit(void) -{ - platform_driver_unregister(&cmem_lat_pmu_driver); - cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state); -} - -module_init(cmem_lat_pmu_init); -module_exit(cmem_lat_pmu_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("NVIDIA Tegra410 CPU Memory Latency PMU driver"); -MODULE_AUTHOR("Besar Wicaksono "); From d007bc52c2950ba37df0e7574085a28f3012ca10 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:25:44 -0700 Subject: [PATCH 04/17] Revert "NVIDIA: VR: SAUCE: perf/arm_cspmu: nvidia: Add Tegra410 PCIE-TGT PMU" This reverts commit ba06e256db01b5c4135d062bc163c78c6256f105. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 76 ---- drivers/perf/arm_cspmu/nvidia_cspmu.c | 324 ------------------ 2 files changed, 400 deletions(-) diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 07dc447eead7c..8528685ddb61e 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -7,7 +7,6 @@ metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) * PCIE -* PCIE-TGT PMU Driver ---------- @@ -212,11 +211,6 @@ Example usage: perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/ -.. _NVIDIA_T410_PCIE_PMU_RC_Mapping_Section: - -Mapping the RC# to lspci segment number -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". 
The DVSEC register @@ -272,73 +266,3 @@ Example output:: 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 - -PCIE-TGT PMU ------------- - -The PCIE-TGT PMU monitors traffic targeting PCIE BAR and CXL HDM ranges. -There is one PCIE-TGT PMU per PCIE root complex (RC) in the SoC. Each RC in -Tegra410 SoC can have up to 16 lanes that can be bifurcated into up to 8 root -ports (RP). The PMU provides RP filter to count PCIE BAR traffic to each RP and -address filter to count access to PCIE BAR or CXL HDM ranges. The details -of the filters are described in the following sections. - -Mapping the RC# to lspci segment number is similar to the PCIE PMU. -Please see :ref:`NVIDIA_T410_PCIE_PMU_RC_Mapping_Section` for more info. - -The events and configuration options of this PMU device are available in sysfs, -see /sys/bus/event_source/devices/nvidia_pcie_tgt_pmu__rc_. - -The events in this PMU can be used to measure bandwidth and utilization: - - * rd_req: count the number of read requests to PCIE. - * wr_req: count the number of write requests to PCIE. - * rd_bytes: count the number of bytes transferred by rd_req. - * wr_bytes: count the number of bytes transferred by wr_req. - * cycles: counts the PCIE cycles. - -The average bandwidth is calculated as:: - - AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS - AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS - -The average request rate is calculated as:: - - AVG_RD_REQUEST_RATE = RD_REQ / CYCLES - AVG_WR_REQUEST_RATE = WR_REQ / CYCLES - -The PMU events can be filtered based on the destination root port or target -address range. Filtering based on RP is only available for PCIE BAR traffic. -Address filter works for both PCIE BAR and CXL HDM ranges. These filters can be -found in sysfs, see -/sys/bus/event_source/devices/nvidia_pcie_tgt_pmu__rc_/format/. 
- -Destination filter settings: - -* dst_rp_mask: bitmask to select the root port(s) to monitor. E.g. "dst_rp_mask=0xFF" - corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is - only available for PCIE BAR traffic. -* dst_addr_base: BAR or CXL HDM filter base address. -* dst_addr_mask: BAR or CXL HDM filter address mask. -* dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the - address range specified by "dst_addr_base" and "dst_addr_mask" will be used to filter - the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison - to determine if the traffic destination address falls within the filter range:: - - (txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask) - - If the comparison succeeds, then the event will be counted. - -If the destination filter is not specified, the RP filter will be configured by default -to count PCIE BAR traffic to all root ports. - -Example usage: - -* Count event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0:: - - perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/ - -* Count event id 0x1 for accesses to PCIE BAR or CXL HDM address range - 0x10000 to 0x100FF on socket 0's PCIE RC-1:: - - perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/ diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 095d2f322c6f9..3a5531d1f94c7 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -42,24 +42,6 @@ #define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) #define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST -#define NV_PCIE_TGT_PORT_COUNT 8ULL -#define NV_PCIE_TGT_EV_TYPE_CC 0x4 -#define NV_PCIE_TGT_EV_TYPE_COUNT 3ULL -#define NV_PCIE_TGT_EV_TYPE_MASK GENMASK_ULL(NV_PCIE_TGT_EV_TYPE_COUNT - 1, 0) -#define NV_PCIE_TGT_FILTER2_MASK GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT, 0) 
-#define NV_PCIE_TGT_FILTER2_PORT GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT - 1, 0) -#define NV_PCIE_TGT_FILTER2_ADDR_EN BIT(NV_PCIE_TGT_PORT_COUNT) -#define NV_PCIE_TGT_FILTER2_ADDR GENMASK_ULL(15, NV_PCIE_TGT_PORT_COUNT) -#define NV_PCIE_TGT_FILTER2_DEFAULT NV_PCIE_TGT_FILTER2_PORT - -#define NV_PCIE_TGT_ADDR_COUNT 8ULL -#define NV_PCIE_TGT_ADDR_STRIDE 20 -#define NV_PCIE_TGT_ADDR_CTRL 0xD38 -#define NV_PCIE_TGT_ADDR_BASE_LO 0xD3C -#define NV_PCIE_TGT_ADDR_BASE_HI 0xD40 -#define NV_PCIE_TGT_ADDR_MASK_LO 0xD44 -#define NV_PCIE_TGT_ADDR_MASK_HI 0xD48 - #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -204,15 +186,6 @@ static struct attribute *pcie_v2_pmu_event_attrs[] = { NULL, }; -static struct attribute *pcie_tgt_pmu_event_attrs[] = { - ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), - ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), - ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), - ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), - ARM_CSPMU_EVENT_ATTR(cycles, NV_PCIE_TGT_EV_TYPE_CC), - NULL, -}; - static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -266,15 +239,6 @@ static struct attribute *pcie_v2_pmu_format_attrs[] = { NULL, }; -static struct attribute *pcie_tgt_pmu_format_attrs[] = { - ARM_CSPMU_FORMAT_ATTR(event, "config:0-2"), - ARM_CSPMU_FORMAT_ATTR(dst_rp_mask, "config:3-10"), - ARM_CSPMU_FORMAT_ATTR(dst_addr_en, "config:11"), - ARM_CSPMU_FORMAT_ATTR(dst_addr_base, "config1:0-63"), - ARM_CSPMU_FORMAT_ATTR(dst_addr_mask, "config2:0-63"), - NULL, -}; - static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -514,268 +478,6 @@ static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, return 0; } -struct pcie_tgt_addr_filter { - u32 refcount; - u64 base; - u64 mask; -}; - -struct pcie_tgt_data { - struct pcie_tgt_addr_filter addr_filter[NV_PCIE_TGT_ADDR_COUNT]; - void __iomem *addr_filter_reg; 
-}; - -#if defined(CONFIG_ACPI) -static int pcie_tgt_init_data(struct arm_cspmu *cspmu) -{ - int ret; - struct acpi_device *adev; - struct pcie_tgt_data *data; - struct list_head resource_list; - struct resource_entry *rentry; - struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu); - struct device *dev = cspmu->dev; - - data = devm_kzalloc(dev, sizeof(struct pcie_tgt_data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - adev = arm_cspmu_acpi_dev_get(cspmu); - if (!adev) { - dev_err(dev, "failed to get associated PCIE-TGT device\n"); - return -ENODEV; - } - - INIT_LIST_HEAD(&resource_list); - ret = acpi_dev_get_memory_resources(adev, &resource_list); - if (ret < 0) { - dev_err(dev, "failed to get PCIE-TGT device memory resources\n"); - acpi_dev_put(adev); - return ret; - } - - rentry = list_first_entry_or_null( - &resource_list, struct resource_entry, node); - if (rentry) { - data->addr_filter_reg = devm_ioremap_resource(dev, rentry->res); - ret = 0; - } - - if (IS_ERR(data->addr_filter_reg)) { - dev_err(dev, "failed to get address filter resource\n"); - ret = PTR_ERR(data->addr_filter_reg); - } - - acpi_dev_free_resource_list(&resource_list); - acpi_dev_put(adev); - - ctx->data = data; - - return ret; -} -#else -static int pcie_tgt_init_data(struct arm_cspmu *cspmu) -{ - return -ENODEV; -} -#endif - -static struct pcie_tgt_data *pcie_tgt_get_data(struct arm_cspmu *cspmu) -{ - struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu); - - return ctx->data; -} - -/* Find the first available address filter slot. 
*/ -static int pcie_tgt_find_addr_idx(struct arm_cspmu *cspmu, u64 base, u64 mask, - bool is_reset) -{ - int i; - struct pcie_tgt_data *data = pcie_tgt_get_data(cspmu); - - for (i = 0; i < NV_PCIE_TGT_ADDR_COUNT; i++) { - if (!is_reset && data->addr_filter[i].refcount == 0) - return i; - - if (data->addr_filter[i].base == base && - data->addr_filter[i].mask == mask) - return i; - } - - return -ENODEV; -} - -static u32 pcie_tgt_pmu_event_filter(const struct perf_event *event) -{ - u32 filter; - - filter = (event->attr.config >> NV_PCIE_TGT_EV_TYPE_COUNT) & - NV_PCIE_TGT_FILTER2_MASK; - - return filter; -} - -static bool pcie_tgt_pmu_addr_en(const struct perf_event *event) -{ - u32 filter = pcie_tgt_pmu_event_filter(event); - - return FIELD_GET(NV_PCIE_TGT_FILTER2_ADDR_EN, filter) != 0; -} - -static u32 pcie_tgt_pmu_port_filter(const struct perf_event *event) -{ - u32 filter = pcie_tgt_pmu_event_filter(event); - - return FIELD_GET(NV_PCIE_TGT_FILTER2_PORT, filter); -} - -static u64 pcie_tgt_pmu_dst_addr_base(const struct perf_event *event) -{ - return event->attr.config1; -} - -static u64 pcie_tgt_pmu_dst_addr_mask(const struct perf_event *event) -{ - return event->attr.config2; -} - -static int pcie_tgt_pmu_validate_event(struct arm_cspmu *cspmu, - struct perf_event *new_ev) -{ - u64 base, mask; - int idx; - - if (!pcie_tgt_pmu_addr_en(new_ev)) - return 0; - - /* Make sure there is a slot available for the address filter. 
*/ - base = pcie_tgt_pmu_dst_addr_base(new_ev); - mask = pcie_tgt_pmu_dst_addr_mask(new_ev); - idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false); - if (idx < 0) - return -EINVAL; - - return 0; -} - -static void pcie_tgt_pmu_config_addr_filter(struct arm_cspmu *cspmu, - bool en, u64 base, u64 mask, int idx) -{ - struct pcie_tgt_data *data; - struct pcie_tgt_addr_filter *filter; - void __iomem *filter_reg; - - data = pcie_tgt_get_data(cspmu); - filter = &data->addr_filter[idx]; - filter_reg = data->addr_filter_reg + (idx * NV_PCIE_TGT_ADDR_STRIDE); - - if (en) { - filter->refcount++; - if (filter->refcount == 1) { - filter->base = base; - filter->mask = mask; - - writel(lower_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_LO); - writel(upper_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_HI); - writel(lower_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_LO); - writel(upper_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_HI); - writel(1, filter_reg + NV_PCIE_TGT_ADDR_CTRL); - } - } else { - filter->refcount--; - if (filter->refcount == 0) { - writel(0, filter_reg + NV_PCIE_TGT_ADDR_CTRL); - writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_LO); - writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_HI); - writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_LO); - writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_HI); - - filter->base = 0; - filter->mask = 0; - } - } -} - -static void pcie_tgt_pmu_set_ev_filter(struct arm_cspmu *cspmu, - const struct perf_event *event) -{ - bool addr_filter_en; - int idx; - u32 filter2_val, filter2_offset, port_filter; - u64 base, mask; - - filter2_val = 0; - filter2_offset = PMEVFILT2R + (4 * event->hw.idx); - - addr_filter_en = pcie_tgt_pmu_addr_en(event); - if (addr_filter_en) { - base = pcie_tgt_pmu_dst_addr_base(event); - mask = pcie_tgt_pmu_dst_addr_mask(event); - idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false); - - if (idx < 0) { - dev_err(cspmu->dev, - "Unable to find a slot for address filtering\n"); - writel(0, cspmu->base0 + 
filter2_offset); - return; - } - - /* Configure address range filter registers.*/ - pcie_tgt_pmu_config_addr_filter(cspmu, true, base, mask, idx); - - /* Config the counter to use the selected address filter slot. */ - filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_ADDR, 1U << idx); - } - - port_filter = pcie_tgt_pmu_port_filter(event); - - /* Monitor all ports if no filter is selected. */ - if (!addr_filter_en && port_filter == 0) - port_filter = NV_PCIE_TGT_FILTER2_PORT; - - filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_PORT, port_filter); - - writel(filter2_val, cspmu->base0 + filter2_offset); -} - -static void pcie_tgt_pmu_reset_ev_filter(struct arm_cspmu *cspmu, - const struct perf_event *event) -{ - bool addr_filter_en; - u64 base, mask; - int idx; - - addr_filter_en = pcie_tgt_pmu_addr_en(event); - if (!addr_filter_en) - return; - - base = pcie_tgt_pmu_dst_addr_base(event); - mask = pcie_tgt_pmu_dst_addr_mask(event); - idx = pcie_tgt_find_addr_idx(cspmu, base, mask, true); - - if (idx < 0) { - dev_err(cspmu->dev, - "Unable to find the address filter slot to reset\n"); - return; - } - - pcie_tgt_pmu_config_addr_filter( - cspmu, false, base, mask, idx); -} - -static u32 pcie_tgt_pmu_event_type(const struct perf_event *event) -{ - return event->attr.config & NV_PCIE_TGT_EV_TYPE_MASK; -} - -static bool pcie_tgt_pmu_is_cycle_counter_event(const struct perf_event *event) -{ - u32 event_type = pcie_tgt_pmu_event_type(event); - - return event_type == NV_PCIE_TGT_EV_TYPE_CC; -} - enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, NAME_FMT_SOCKET, @@ -920,30 +622,6 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .reset_ev_filter = nv_cspmu_reset_ev_filter, } }, - { - .prodid = 0x10700000, - .prodid_mask = NV_PRODID_MASK, - .name_pattern = "nvidia_pcie_tgt_pmu_%u_rc_%u", - .name_fmt = NAME_FMT_SOCKET_INST, - .template_ctx = { - .event_attr = pcie_tgt_pmu_event_attrs, - .format_attr = pcie_tgt_pmu_format_attrs, - .filter_mask = 0x0, - .filter_default_val = 0x0, - 
.filter2_mask = NV_PCIE_TGT_FILTER2_MASK, - .filter2_default_val = NV_PCIE_TGT_FILTER2_DEFAULT, - .get_filter = NULL, - .get_filter2 = NULL, - .init_data = pcie_tgt_init_data - }, - .ops = { - .is_cycle_counter_event = pcie_tgt_pmu_is_cycle_counter_event, - .event_type = pcie_tgt_pmu_event_type, - .validate_event = pcie_tgt_pmu_validate_event, - .set_ev_filter = pcie_tgt_pmu_set_ev_filter, - .reset_ev_filter = pcie_tgt_pmu_reset_ev_filter, - } - }, { .prodid = 0, .prodid_mask = 0, @@ -1039,8 +717,6 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) /* NVIDIA specific callbacks. */ SET_OP(validate_event, impl_ops, match, NULL); - SET_OP(event_type, impl_ops, match, NULL); - SET_OP(is_cycle_counter_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); SET_OP(reset_ev_filter, impl_ops, match, NULL); From 07a7992713f3b393fb32b374d29f49e3576c87ee Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:25:48 -0700 Subject: [PATCH 05/17] Revert "NVIDIA: VR: SAUCE: perf/arm_cspmu: nvidia: Add Tegra410 PCIE PMU" This reverts commit 6984fc5cbdb379ca9d4239e0b573b845a7a69b95. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. 
Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 162 -------------- drivers/perf/arm_cspmu/nvidia_cspmu.c | 208 +----------------- 2 files changed, 2 insertions(+), 368 deletions(-) diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 8528685ddb61e..7b7ba5700ca19 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -6,7 +6,6 @@ The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) -* PCIE PMU Driver ---------- @@ -105,164 +104,3 @@ Example usage: destination filter = remote memory:: perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ - -PCIE PMU --------- - -This PMU monitors all read/write traffic from the root port(s) or a particular -BDF in a PCIE root complex (RC) to local or remote memory. There is one PMU per -PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into -up to 8 root ports. The traffic from each root port can be filtered using RP or -BDF filter. For example, specifying "src_rp_mask=0xFF" means the PMU counter will -capture traffic from all RPs. Please see below for more details. - -The events and configuration options of this PMU device are described in sysfs, -see /sys/bus/event_source/devices/nvidia_pcie_pmu__rc_. - -The events in this PMU can be used to measure bandwidth, utilization, and -latency: - - * rd_req: count the number of read requests by PCIE device. - * wr_req: count the number of write requests by PCIE device. - * rd_bytes: count the number of bytes transferred by rd_req. - * wr_bytes: count the number of bytes transferred by wr_req. - * rd_cum_outs: count outstanding rd_req each cycle. - * cycles: counts the PCIE cycles. 
- -The average bandwidth is calculated as:: - - AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS - AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS - -The average request rate is calculated as:: - - AVG_RD_REQUEST_RATE = RD_REQ / CYCLES - AVG_WR_REQUEST_RATE = WR_REQ / CYCLES - - -The average latency is calculated as:: - - FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS - AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ - AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ - -The PMU events can be filtered based on the traffic source and destination. -The source filter indicates the PCIE devices that will be monitored. The -destination filter specifies the destination memory type, e.g. local system -memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote -classification of the destination filter is based on the home socket of the -address, not where the data actually resides. These filters can be found in -/sys/bus/event_source/devices/nvidia_pcie_pmu__rc_/format/. - -The list of event filters: - -* Source filter: - - * src_rp_mask: bitmask of root ports that will be monitored. Each bit in this - bitmask represents the RP index in the RC. If the bit is set, all devices under - the associated RP will be monitored. E.g "src_rp_mask=0xF" will monitor - devices in root port 0 to 3. - * src_bdf: the BDF that will be monitored. This is a 16-bit value that - follows formula: (bus << 8) + (device << 3) + (function). For example, the - value of BDF 27:01.1 is 0x2781. - * src_bdf_en: enable the BDF filter. If this is set, the BDF filter value in - "src_bdf" is used to filter the traffic. - - Note that Root-Port and BDF filters are mutually exclusive and the PMU in - each RC can only have one BDF filter for the whole counters. If BDF filter - is enabled, the BDF filter value will be applied to all events. 
- -* Destination filter: - - * dst_loc_cmem: if set, count events to local system memory (CMEM) address - * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address - * dst_loc_pcie_p2p: if set, count events to local PCIE peer address - * dst_loc_pcie_cxl: if set, count events to local CXL memory address - * dst_rem: if set, count events to remote memory address - -If the source filter is not specified, the PMU will count events from all root -ports. If the destination filter is not specified, the PMU will count events -to all destinations. - -Example usage: - -* Count event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all - destinations:: - - perf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/ - -* Count event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and - targeting just local CMEM of socket 0:: - - perf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/ - -* Count event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all - destinations:: - - perf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/ - -* Count event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and - targeting just local CMEM of socket 1:: - - perf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/ - -* Count event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all - destinations:: - - perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/ - -Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA -Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space -for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". 
The DVSEC register -contains the following information to map PCIE devices under the RP back to its RC# : - - - Bus# (byte 0xc) : bus number as reported by the lspci output - - Segment# (byte 0xd) : segment number as reported by the lspci output - - RP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability - - RC# (byte 0xf): root complex number associated with the RP - - Socket# (byte 0x10): socket number associated with the RP - -Example script for mapping lspci BDF to RC# and socket#:: - - #!/bin/bash - while read bdf rest; do - dvsec4_reg=$(lspci -vv -s $bdf | awk ' - /Designated Vendor-Specific: Vendor=10de ID=0004/ { - match($0, /\[([0-9a-fA-F]+)/, arr); - print "0x" arr[1]; - exit - } - ') - if [ -n "$dvsec4_reg" ]; then - bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b) - segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b) - rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b) - rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b) - socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b) - echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket" - fi - done < <(lspci -d 10de:) - -Example output:: - - 0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00 - 0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00 - 0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00 - 0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00 - 0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00 - 0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00 - 0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00 - 0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00 - 0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00 - 0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00 - 0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00 - 0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01 - 
000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01 - 000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01 - 000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01 - 000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01 - 000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01 - 000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01 - 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 - 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 - 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 3a5531d1f94c7..c67667097a3cd 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -8,7 +8,6 @@ #include #include -#include #include #include "arm_cspmu.h" @@ -29,19 +28,6 @@ #define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) #define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) -#define NV_PCIE_V2_PORT_COUNT 8ULL -#define NV_PCIE_V2_FILTER_ID_MASK GENMASK_ULL(24, 0) -#define NV_PCIE_V2_FILTER_PORT GENMASK_ULL(NV_PCIE_V2_PORT_COUNT - 1, 0) -#define NV_PCIE_V2_FILTER_BDF_VAL GENMASK_ULL(23, NV_PCIE_V2_PORT_COUNT) -#define NV_PCIE_V2_FILTER_BDF_EN BIT(24) -#define NV_PCIE_V2_FILTER_BDF_VAL_EN GENMASK_ULL(24, NV_PCIE_V2_PORT_COUNT) -#define NV_PCIE_V2_FILTER_DEFAULT NV_PCIE_V2_FILTER_PORT - -#define NV_PCIE_V2_DST_COUNT 5ULL -#define NV_PCIE_V2_FILTER2_ID_MASK GENMASK_ULL(4, 0) -#define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) -#define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST - #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -176,16 +162,6 @@ static struct attribute *ucf_pmu_event_attrs[] = { NULL, }; -static struct attribute *pcie_v2_pmu_event_attrs[] = { - ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), - ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), - ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), - 
ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), - ARM_CSPMU_EVENT_ATTR(rd_cum_outs, 0x4), - ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), - NULL, -}; - static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -226,19 +202,6 @@ static struct attribute *ucf_pmu_format_attrs[] = { NULL, }; -static struct attribute *pcie_v2_pmu_format_attrs[] = { - ARM_CSPMU_FORMAT_EVENT_ATTR, - ARM_CSPMU_FORMAT_ATTR(src_rp_mask, "config1:0-7"), - ARM_CSPMU_FORMAT_ATTR(src_bdf, "config1:8-23"), - ARM_CSPMU_FORMAT_ATTR(src_bdf_en, "config1:24"), - ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config2:0"), - ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config2:1"), - ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_p2p, "config2:2"), - ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_cxl, "config2:3"), - ARM_CSPMU_FORMAT_ATTR(dst_rem, "config2:4"), - NULL, -}; - static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -270,32 +233,6 @@ nv_cspmu_get_name(const struct arm_cspmu *cspmu) return ctx->name; } -#if defined(CONFIG_ACPI) -static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) -{ - struct fwnode_handle *fwnode; - struct acpi_device *adev; - int ret; - - adev = arm_cspmu_acpi_dev_get(cspmu); - if (!adev) - return -ENODEV; - - fwnode = acpi_fwnode_handle(adev); - ret = fwnode_property_read_u32(fwnode, "instance_id", id); - if (ret) - dev_err(cspmu->dev, "Failed to get instance ID\n"); - - acpi_dev_put(adev); - return ret; -} -#else -static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) -{ - return -EINVAL; -} -#endif - static u32 nv_cspmu_event_filter(const struct perf_event *event) { const struct nv_cspmu_ctx *ctx = @@ -341,20 +278,6 @@ static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, } } -static void nv_cspmu_reset_ev_filter(struct arm_cspmu *cspmu, - const struct perf_event *event) -{ - const struct nv_cspmu_ctx *ctx = - 
to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); - const u32 offset = 4 * event->hw.idx; - - if (ctx->get_filter) - writel(0, cspmu->base0 + PMEVFILTR + offset); - - if (ctx->get_filter2) - writel(0, cspmu->base0 + PMEVFILT2R + offset); -} - static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { @@ -385,103 +308,9 @@ static u32 ucf_pmu_event_filter(const struct perf_event *event) return ret; } -static u32 pcie_v2_pmu_bdf_val_en(u32 filter) -{ - const u32 bdf_en = FIELD_GET(NV_PCIE_V2_FILTER_BDF_EN, filter); - - /* Returns both BDF value and enable bit if BDF filtering is enabled. */ - if (bdf_en) - return FIELD_GET(NV_PCIE_V2_FILTER_BDF_VAL_EN, filter); - - /* Ignore the BDF value if BDF filter is not enabled. */ - return 0; -} - -static u32 pcie_v2_pmu_event_filter(const struct perf_event *event) -{ - u32 filter, lead_filter, lead_bdf; - struct perf_event *leader; - const struct nv_cspmu_ctx *ctx = - to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); - - filter = event->attr.config1 & ctx->filter_mask; - if (filter != 0) - return filter; - - leader = event->group_leader; - - /* Use leader's filter value if its BDF filtering is enabled. */ - if (event != leader) { - lead_filter = pcie_v2_pmu_event_filter(leader); - lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); - if (lead_bdf != 0) - return lead_filter; - } - - /* Otherwise, return default filter value. */ - return ctx->filter_default_val; -} - -static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, - struct perf_event *new_ev) -{ - /* - * Make sure the events are using same BDF filter since the PCIE-SRC PMU - * only supports one common BDF filter setting for all of the counters. 
- */ - - int idx; - u32 new_filter, new_rp, new_bdf, new_lead_filter, new_lead_bdf; - struct perf_event *leader, *new_leader; - - if (cspmu->impl.ops.is_cycle_counter_event(new_ev)) - return 0; - - new_leader = new_ev->group_leader; - - new_filter = pcie_v2_pmu_event_filter(new_ev); - new_lead_filter = pcie_v2_pmu_event_filter(new_leader); - - new_bdf = pcie_v2_pmu_bdf_val_en(new_filter); - new_lead_bdf = pcie_v2_pmu_bdf_val_en(new_lead_filter); - - new_rp = FIELD_GET(NV_PCIE_V2_FILTER_PORT, new_filter); - - if (new_rp != 0 && new_bdf != 0) { - dev_err(cspmu->dev, - "RP and BDF filtering are mutually exclusive\n"); - return -EINVAL; - } - - if (new_bdf != new_lead_bdf) { - dev_err(cspmu->dev, - "sibling and leader BDF value should be equal\n"); - return -EINVAL; - } - - /* Compare BDF filter on existing events. */ - idx = find_first_bit(cspmu->hw_events.used_ctrs, - cspmu->cycle_counter_logical_idx); - - if (idx != cspmu->cycle_counter_logical_idx) { - leader = cspmu->hw_events.events[idx]->group_leader; - - const u32 lead_filter = pcie_v2_pmu_event_filter(leader); - const u32 lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); - - if (new_lead_bdf != lead_bdf) { - dev_err(cspmu->dev, "only one BDF value is supported\n"); - return -EINVAL; - } - } - - return 0; -} - enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, - NAME_FMT_SOCKET, - NAME_FMT_SOCKET_INST + NAME_FMT_SOCKET }; struct nv_cspmu_match { @@ -601,27 +430,6 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .init_data = NULL }, }, - { - .prodid = 0x10301000, - .prodid_mask = NV_PRODID_MASK, - .name_pattern = "nvidia_pcie_pmu_%u_rc_%u", - .name_fmt = NAME_FMT_SOCKET_INST, - .template_ctx = { - .event_attr = pcie_v2_pmu_event_attrs, - .format_attr = pcie_v2_pmu_format_attrs, - .filter_mask = NV_PCIE_V2_FILTER_ID_MASK, - .filter_default_val = NV_PCIE_V2_FILTER_DEFAULT, - .filter2_mask = NV_PCIE_V2_FILTER2_ID_MASK, - .filter2_default_val = NV_PCIE_V2_FILTER2_DEFAULT, - .get_filter = pcie_v2_pmu_event_filter, - 
.get_filter2 = nv_cspmu_event_filter2, - .init_data = NULL - }, - .ops = { - .validate_event = pcie_v2_pmu_validate_event, - .reset_ev_filter = nv_cspmu_reset_ev_filter, - } - }, { .prodid = 0, .prodid_mask = 0, @@ -645,7 +453,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, const struct nv_cspmu_match *match) { - char *name = NULL; + char *name; struct device *dev = cspmu->dev; static atomic_t pmu_generic_idx = {0}; @@ -659,16 +467,6 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, socket); break; } - case NAME_FMT_SOCKET_INST: { - const int cpu = cpumask_first(&cspmu->associated_cpus); - const int socket = cpu_to_node(cpu); - u32 inst_id; - - if (!nv_cspmu_get_inst_id(cspmu, &inst_id)) - name = devm_kasprintf(dev, GFP_KERNEL, - match->name_pattern, socket, inst_id); - break; - } case NAME_FMT_GENERIC: name = devm_kasprintf(dev, GFP_KERNEL, match->name_pattern, atomic_fetch_inc(&pmu_generic_idx)); @@ -716,10 +514,8 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. */ - SET_OP(validate_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); - SET_OP(reset_ev_filter, impl_ops, match, NULL); SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs); SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs); SET_OP(get_name, impl_ops, match, nv_cspmu_get_name); From 08c61448ccf5ad2c71ee655a251541b67dc836ae Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:25:52 -0700 Subject: [PATCH 06/17] Revert "NVIDIA: VR: SAUCE: perf/arm_cspmu: Add arm_cspmu_acpi_dev_get" This reverts commit a2ab08daa7b3ad98f34c89edb451053ede1f549d. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. 
Ochs --- drivers/perf/arm_cspmu/arm_cspmu.c | 24 +----------------------- drivers/perf/arm_cspmu/arm_cspmu.h | 17 +---------------- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index dadc9b765d801..34430b68f6025 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -16,7 +16,7 @@ * The user should refer to the vendor technical documentation to get details * about the supported events. * - * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ @@ -1132,28 +1132,6 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) return 0; } - -struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) -{ - char hid[16]; - char uid[16]; - struct acpi_device *adev; - const struct acpi_apmt_node *apmt_node; - - apmt_node = arm_cspmu_apmt_node(cspmu->dev); - if (!apmt_node || apmt_node->type != ACPI_APMT_NODE_TYPE_ACPI) - return NULL; - - memset(hid, 0, sizeof(hid)); - memset(uid, 0, sizeof(uid)); - - memcpy(hid, &apmt_node->inst_primary, sizeof(apmt_node->inst_primary)); - snprintf(uid, sizeof(uid), "%u", apmt_node->inst_secondary); - - adev = acpi_dev_get_first_match_dev(hid, uid, -1); - return adev; -} -EXPORT_SYMBOL_GPL(arm_cspmu_acpi_dev_get); #else static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) { diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 3200966732003..cd65a58dbd884 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -1,14 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 * * ARM CoreSight Architecture PMU driver. - * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* */ #ifndef __ARM_CSPMU_H__ #define __ARM_CSPMU_H__ -#include #include #include #include @@ -256,18 +255,4 @@ int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match); /* Unregister vendor backend. */ void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match); -#if defined(CONFIG_ACPI) -/** - * Get ACPI device associated with the PMU. - * The caller is responsible for calling acpi_dev_put() on the returned device. - */ -struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu); -#else -static inline struct acpi_device * -arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) -{ - return NULL; -} -#endif - #endif /* __ARM_CSPMU_H__ */ From 02a9cdcca68faad819b47c27c351774e580350f7 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:25:56 -0700 Subject: [PATCH 07/17] Revert "NVIDIA: VR: SAUCE: perf/arm_cspmu: nvidia: Add Tegra410 UCF PMU" This reverts commit e12d030d5a967372345b960ac4130e4f112bf1d5. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. 
Ochs --- Documentation/admin-guide/perf/index.rst | 1 - .../admin-guide/perf/nvidia-tegra410-pmu.rst | 106 ------------------ drivers/perf/arm_cspmu/nvidia_cspmu.c | 90 +-------------- 3 files changed, 1 insertion(+), 196 deletions(-) delete mode 100644 Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index aa12708ddb965..c407bb44b08e3 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -25,7 +25,6 @@ Performance monitor support alibaba_pmu dwc_pcie_pmu nvidia-tegra241-pmu - nvidia-tegra410-pmu meson-ddr-pmu cxl ampere_cspmu diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst deleted file mode 100644 index 7b7ba5700ca19..0000000000000 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ /dev/null @@ -1,106 +0,0 @@ -===================================================================== -NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU) -===================================================================== - -The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance -metrics like memory bandwidth, latency, and utilization: - -* Unified Coherence Fabric (UCF) - -PMU Driver ----------- - -The PMU driver describes the available events and configuration of each PMU in -sysfs. Please see the sections below to get the sysfs path of each PMU. Like -other uncore PMU drivers, the driver provides "cpumask" sysfs attribute to show -the CPU id used to handle the PMU event. There is also "associated_cpus" -sysfs attribute, which contains a list of CPUs associated with the PMU instance. 
- -UCF PMU -------- - -The Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a -distributed cache, last level for CPU Memory and CXL Memory, and cache coherent -interconnect that supports hardware coherence across multiple coherently caching -agents, including: - - * CPU clusters - * GPU - * PCIe Ordering Controller Unit (OCU) - * Other IO-coherent requesters - -The events and configuration options of this PMU device are described in sysfs, -see /sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>. - -Some of the events available in this PMU can be used to measure bandwidth and -utilization: - - * slc_access_rd: count the number of read requests to SLC. - * slc_access_wr: count the number of write requests to SLC. - * slc_bytes_rd: count the number of bytes transferred by slc_access_rd. - * slc_bytes_wr: count the number of bytes transferred by slc_access_wr. - * mem_access_rd: count the number of read requests to local or remote memory. - * mem_access_wr: count the number of write requests to local or remote memory. - * mem_bytes_rd: count the number of bytes transferred by mem_access_rd. - * mem_bytes_wr: count the number of bytes transferred by mem_access_wr. - * cycles: counts the UCF cycles. - -The average bandwidth is calculated as:: - - AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS - AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS - AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS - AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS - -The average request rate is calculated as:: - - AVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES - AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES - AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES - AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES - -More details about what other events are available can be found in Tegra410 SoC -technical reference manual. - -The events can be filtered based on source or destination.
The source filter -indicates the traffic initiator to the SLC, e.g local CPU, non-CPU device, or -remote socket. The destination filter specifies the destination memory type, -e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The -local/remote classification of the destination filter is based on the home -socket of the address, not where the data actually resides. The available -filters are described in -/sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>/format/. - -The list of UCF PMU event filters: - -* Source filter: - - * src_loc_cpu: if set, count events from local CPU - * src_loc_noncpu: if set, count events from local non-CPU device - * src_rem: if set, count events from CPU, GPU, PCIE devices of remote socket - -* Destination filter: - - * dst_loc_cmem: if set, count events to local system memory (CMEM) address - * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address - * dst_loc_other: if set, count events to local CXL memory address - * dst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket - -If the source is not specified, the PMU will count events from all sources. If -the destination is not specified, the PMU will count events to all destinations.
- -Example usage: - -* Count event id 0x0 in socket 0 from all sources and to all destinations:: - - perf stat -a -e nvidia_ucf_pmu_0/event=0x0/ - -* Count event id 0x0 in socket 0 with source filter = local CPU and destination - filter = local system memory (CMEM):: - - perf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/ - -* Count event id 0x0 in socket 1 with source filter = local non-CPU device and - destination filter = remote memory:: - - perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index c67667097a3cd..e06a06d3407b1 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ @@ -21,13 +21,6 @@ #define NV_CNVL_PORT_COUNT 4ULL #define NV_CNVL_FILTER_ID_MASK GENMASK_ULL(NV_CNVL_PORT_COUNT - 1, 0) -#define NV_UCF_SRC_COUNT 3ULL -#define NV_UCF_DST_COUNT 4ULL -#define NV_UCF_FILTER_ID_MASK GENMASK_ULL(11, 0) -#define NV_UCF_FILTER_SRC GENMASK_ULL(2, 0) -#define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) -#define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) - #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -131,37 +124,6 @@ static struct attribute *mcf_pmu_event_attrs[] = { NULL, }; -static struct attribute *ucf_pmu_event_attrs[] = { - ARM_CSPMU_EVENT_ATTR(bus_cycles, 0x1D), - - ARM_CSPMU_EVENT_ATTR(slc_allocate, 0xF0), - ARM_CSPMU_EVENT_ATTR(slc_wb, 0xF3), - ARM_CSPMU_EVENT_ATTR(slc_refill_rd, 0x109), - ARM_CSPMU_EVENT_ATTR(slc_refill_wr, 0x10A), - ARM_CSPMU_EVENT_ATTR(slc_hit_rd, 0x119), - - ARM_CSPMU_EVENT_ATTR(slc_access_dataless, 0x183), - 
ARM_CSPMU_EVENT_ATTR(slc_access_atomic, 0x184), - - ARM_CSPMU_EVENT_ATTR(slc_access, 0xF2), - ARM_CSPMU_EVENT_ATTR(slc_access_rd, 0x111), - ARM_CSPMU_EVENT_ATTR(slc_access_wr, 0x112), - ARM_CSPMU_EVENT_ATTR(slc_bytes_rd, 0x113), - ARM_CSPMU_EVENT_ATTR(slc_bytes_wr, 0x114), - - ARM_CSPMU_EVENT_ATTR(mem_access_rd, 0x121), - ARM_CSPMU_EVENT_ATTR(mem_access_wr, 0x122), - ARM_CSPMU_EVENT_ATTR(mem_bytes_rd, 0x123), - ARM_CSPMU_EVENT_ATTR(mem_bytes_wr, 0x124), - - ARM_CSPMU_EVENT_ATTR(local_snoop, 0x180), - ARM_CSPMU_EVENT_ATTR(ext_snp_access, 0x181), - ARM_CSPMU_EVENT_ATTR(ext_snp_evict, 0x182), - - ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), - NULL, -}; - static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -190,18 +152,6 @@ static struct attribute *cnvlink_pmu_format_attrs[] = { NULL, }; -static struct attribute *ucf_pmu_format_attrs[] = { - ARM_CSPMU_FORMAT_EVENT_ATTR, - ARM_CSPMU_FORMAT_ATTR(src_loc_noncpu, "config1:0"), - ARM_CSPMU_FORMAT_ATTR(src_loc_cpu, "config1:1"), - ARM_CSPMU_FORMAT_ATTR(src_rem, "config1:2"), - ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config1:8"), - ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config1:9"), - ARM_CSPMU_FORMAT_ATTR(dst_loc_other, "config1:10"), - ARM_CSPMU_FORMAT_ATTR(dst_rem, "config1:11"), - NULL, -}; - static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -286,27 +236,6 @@ static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, writel(filter, cspmu->base0 + PMCCFILTR); } -static u32 ucf_pmu_event_filter(const struct perf_event *event) -{ - u32 ret, filter, src, dst; - - filter = nv_cspmu_event_filter(event); - - /* Monitor all sources if none is selected. */ - src = FIELD_GET(NV_UCF_FILTER_SRC, filter); - if (src == 0) - src = GENMASK_ULL(NV_UCF_SRC_COUNT - 1, 0); - - /* Monitor all destinations if none is selected. 
*/ - dst = FIELD_GET(NV_UCF_FILTER_DST, filter); - if (dst == 0) - dst = GENMASK_ULL(NV_UCF_DST_COUNT - 1, 0); - - ret = FIELD_PREP(NV_UCF_FILTER_SRC, src); - ret |= FIELD_PREP(NV_UCF_FILTER_DST, dst); - - return ret; -} enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, @@ -413,23 +342,6 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .init_data = NULL }, }, - { - .prodid = 0x2CF20000, - .prodid_mask = NV_PRODID_MASK, - .name_pattern = "nvidia_ucf_pmu_%u", - .name_fmt = NAME_FMT_SOCKET, - .template_ctx = { - .event_attr = ucf_pmu_event_attrs, - .format_attr = ucf_pmu_format_attrs, - .filter_mask = NV_UCF_FILTER_ID_MASK, - .filter_default_val = NV_UCF_FILTER_DEFAULT, - .filter2_mask = 0x0, - .filter2_default_val = 0x0, - .get_filter = ucf_pmu_event_filter, - .get_filter2 = NULL, - .init_data = NULL - }, - }, { .prodid = 0, .prodid_mask = 0, From 5178a81d9b4d190f3b35c46346e514ec6e7359a3 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 16 Apr 2026 17:26:00 -0700 Subject: [PATCH 08/17] Revert "NVIDIA: VR: SAUCE: perf/arm_cspmu: nvidia: Rename doc to Tegra241" This reverts commit 575f7ef512986af745ef7346bc3badda9a4a7ec8. This will be replaced by the equivalent patch from v7.1. Signed-off-by: Matthew R. 
Ochs --- Documentation/admin-guide/perf/index.rst | 2 +- .../perf/{nvidia-tegra241-pmu.rst => nvidia-pmu.rst} | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) rename Documentation/admin-guide/perf/{nvidia-tegra241-pmu.rst => nvidia-pmu.rst} (98%) diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index c407bb44b08e3..47d9a3df6329b 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -24,7 +24,7 @@ Performance monitor support thunderx2-pmu alibaba_pmu dwc_pcie_pmu - nvidia-tegra241-pmu + nvidia-pmu meson-ddr-pmu cxl ampere_cspmu diff --git a/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst b/Documentation/admin-guide/perf/nvidia-pmu.rst similarity index 98% rename from Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst rename to Documentation/admin-guide/perf/nvidia-pmu.rst index fad5bc4cee6c0..f538ef67e0e8f 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-pmu.rst @@ -1,8 +1,8 @@ -============================================================ -NVIDIA Tegra241 SoC Uncore Performance Monitoring Unit (PMU) -============================================================ +========================================================= +NVIDIA Tegra SoC Uncore Performance Monitoring Unit (PMU) +========================================================= -The NVIDIA Tegra241 SoC includes various system PMUs to measure key performance +The NVIDIA Tegra SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Scalable Coherency Fabric (SCF) From 4729da47268d7378110c8dc1815cf63715a3cb82 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:45 +0000 Subject: [PATCH 09/17] perf/arm_cspmu: nvidia: Rename doc to Tegra241 The documentation in nvidia-pmu.rst contains PMUs specific to NVIDIA Tegra241 SoC. 
Rename the file for this specific SoC to have better distinction with other NVIDIA SoC. Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit d332424d1d06a9fb03ca04ba3f1092c3990125e8) Signed-off-by: Matthew R. Ochs --- Documentation/admin-guide/perf/index.rst | 2 +- .../perf/{nvidia-pmu.rst => nvidia-tegra241-pmu.rst} | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) rename Documentation/admin-guide/perf/{nvidia-pmu.rst => nvidia-tegra241-pmu.rst} (98%) diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 47d9a3df6329b..c407bb44b08e3 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -24,7 +24,7 @@ Performance monitor support thunderx2-pmu alibaba_pmu dwc_pcie_pmu - nvidia-pmu + nvidia-tegra241-pmu meson-ddr-pmu cxl ampere_cspmu diff --git a/Documentation/admin-guide/perf/nvidia-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst similarity index 98% rename from Documentation/admin-guide/perf/nvidia-pmu.rst rename to Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst index f538ef67e0e8f..fad5bc4cee6c0 100644 --- a/Documentation/admin-guide/perf/nvidia-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst @@ -1,8 +1,8 @@ -========================================================= -NVIDIA Tegra SoC Uncore Performance Monitoring Unit (PMU) -========================================================= +============================================================ +NVIDIA Tegra241 SoC Uncore Performance Monitoring Unit (PMU) +============================================================ -The NVIDIA Tegra SoC includes various system PMUs to measure key performance +The NVIDIA Tegra241 SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Scalable Coherency Fabric (SCF) From 2eadbf648c10b808d405bf1ee4587540ce74f729 Mon Sep 17 00:00:00 2001 
From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:46 +0000 Subject: [PATCH 10/17] perf/arm_cspmu: nvidia: Add Tegra410 UCF PMU The Unified Coherence Fabric (UCF) contains last level cache and cache coherent interconnect in Tegra410 SOC. The PMU in this device can be used to capture events related to access to the last level cache and memory from different sources. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit f5caf26fd6c71294d0fb254404ed66f8cff6f7f7) Signed-off-by: Matthew R. Ochs --- Documentation/admin-guide/perf/index.rst | 1 + .../admin-guide/perf/nvidia-tegra410-pmu.rst | 106 ++++++++++++++++++ drivers/perf/arm_cspmu/nvidia_cspmu.c | 87 +++++++++++++- 3 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index c407bb44b08e3..aa12708ddb965 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -25,6 +25,7 @@ Performance monitor support alibaba_pmu dwc_pcie_pmu nvidia-tegra241-pmu + nvidia-tegra410-pmu meson-ddr-pmu cxl ampere_cspmu diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst new file mode 100644 index 0000000000000..7b7ba5700ca19 --- /dev/null +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -0,0 +1,106 @@ +===================================================================== +NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU) +===================================================================== + +The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance +metrics like memory bandwidth, latency, and utilization: + +* Unified Coherence Fabric (UCF) + +PMU Driver +---------- + +The PMU driver describes the available events and configuration of each PMU in 
+sysfs. Please see the sections below to get the sysfs path of each PMU. Like +other uncore PMU drivers, the driver provides "cpumask" sysfs attribute to show +the CPU id used to handle the PMU event. There is also "associated_cpus" +sysfs attribute, which contains a list of CPUs associated with the PMU instance. + +UCF PMU +------- + +The Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a +distributed cache, last level for CPU Memory and CXL Memory, and cache coherent +interconnect that supports hardware coherence across multiple coherently caching +agents, including: + + * CPU clusters + * GPU + * PCIe Ordering Controller Unit (OCU) + * Other IO-coherent requesters + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>. + +Some of the events available in this PMU can be used to measure bandwidth and +utilization: + + * slc_access_rd: count the number of read requests to SLC. + * slc_access_wr: count the number of write requests to SLC. + * slc_bytes_rd: count the number of bytes transferred by slc_access_rd. + * slc_bytes_wr: count the number of bytes transferred by slc_access_wr. + * mem_access_rd: count the number of read requests to local or remote memory. + * mem_access_wr: count the number of write requests to local or remote memory. + * mem_bytes_rd: count the number of bytes transferred by mem_access_rd. + * mem_bytes_wr: count the number of bytes transferred by mem_access_wr. + * cycles: counts the UCF cycles.
+ +The average bandwidth is calculated as:: + + AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS + AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS + AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS + AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES + AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES + AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES + AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES + +More details about what other events are available can be found in Tegra410 SoC +technical reference manual. + +The events can be filtered based on source or destination. The source filter +indicates the traffic initiator to the SLC, e.g local CPU, non-CPU device, or +remote socket. The destination filter specifies the destination memory type, +e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The +local/remote classification of the destination filter is based on the home +socket of the address, not where the data actually resides. The available +filters are described in +/sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>/format/. + +The list of UCF PMU event filters: + +* Source filter: + + * src_loc_cpu: if set, count events from local CPU + * src_loc_noncpu: if set, count events from local non-CPU device + * src_rem: if set, count events from CPU, GPU, PCIE devices of remote socket + +* Destination filter: + + * dst_loc_cmem: if set, count events to local system memory (CMEM) address + * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address + * dst_loc_other: if set, count events to local CXL memory address + * dst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket + +If the source is not specified, the PMU will count events from all sources.
If +the destination is not specified, the PMU will count events to all destinations. + +Example usage: + +* Count event id 0x0 in socket 0 from all sources and to all destinations:: + + perf stat -a -e nvidia_ucf_pmu_0/event=0x0/ + +* Count event id 0x0 in socket 0 with source filter = local CPU and destination + filter = local system memory (CMEM):: + + perf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/ + +* Count event id 0x0 in socket 1 with source filter = local non-CPU device and + destination filter = remote memory:: + + perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index e06a06d3407b1..8e37cbe3bae99 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* */ @@ -21,6 +21,13 @@ #define NV_CNVL_PORT_COUNT 4ULL #define NV_CNVL_FILTER_ID_MASK GENMASK_ULL(NV_CNVL_PORT_COUNT - 1, 0) +#define NV_UCF_SRC_COUNT 3ULL +#define NV_UCF_DST_COUNT 4ULL +#define NV_UCF_FILTER_ID_MASK GENMASK_ULL(11, 0) +#define NV_UCF_FILTER_SRC GENMASK_ULL(2, 0) +#define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) +#define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -124,6 +131,36 @@ static struct attribute *mcf_pmu_event_attrs[] = { NULL, }; +static struct attribute *ucf_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(bus_cycles, 0x1D), + + ARM_CSPMU_EVENT_ATTR(slc_allocate, 0xF0), + ARM_CSPMU_EVENT_ATTR(slc_wb, 0xF3), + ARM_CSPMU_EVENT_ATTR(slc_refill_rd, 0x109), + ARM_CSPMU_EVENT_ATTR(slc_refill_wr, 0x10A), + ARM_CSPMU_EVENT_ATTR(slc_hit_rd, 0x119), + + ARM_CSPMU_EVENT_ATTR(slc_access_dataless, 0x183), + ARM_CSPMU_EVENT_ATTR(slc_access_atomic, 0x184), + + ARM_CSPMU_EVENT_ATTR(slc_access_rd, 0x111), + ARM_CSPMU_EVENT_ATTR(slc_access_wr, 0x112), + ARM_CSPMU_EVENT_ATTR(slc_bytes_rd, 0x113), + ARM_CSPMU_EVENT_ATTR(slc_bytes_wr, 0x114), + + ARM_CSPMU_EVENT_ATTR(mem_access_rd, 0x121), + ARM_CSPMU_EVENT_ATTR(mem_access_wr, 0x122), + ARM_CSPMU_EVENT_ATTR(mem_bytes_rd, 0x123), + ARM_CSPMU_EVENT_ATTR(mem_bytes_wr, 0x124), + + ARM_CSPMU_EVENT_ATTR(local_snoop, 0x180), + ARM_CSPMU_EVENT_ATTR(ext_snp_access, 0x181), + ARM_CSPMU_EVENT_ATTR(ext_snp_evict, 0x182), + + ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -152,6 +189,18 @@ static struct attribute *cnvlink_pmu_format_attrs[] = { NULL, }; +static struct attribute *ucf_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_EVENT_ATTR, + ARM_CSPMU_FORMAT_ATTR(src_loc_noncpu, "config1:0"), + ARM_CSPMU_FORMAT_ATTR(src_loc_cpu, 
"config1:1"), + ARM_CSPMU_FORMAT_ATTR(src_rem, "config1:2"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config1:8"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config1:9"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_other, "config1:10"), + ARM_CSPMU_FORMAT_ATTR(dst_rem, "config1:11"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -236,6 +285,27 @@ static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, writel(filter, cspmu->base0 + PMCCFILTR); } +static u32 ucf_pmu_event_filter(const struct perf_event *event) +{ + u32 ret, filter, src, dst; + + filter = nv_cspmu_event_filter(event); + + /* Monitor all sources if none is selected. */ + src = FIELD_GET(NV_UCF_FILTER_SRC, filter); + if (src == 0) + src = GENMASK_ULL(NV_UCF_SRC_COUNT - 1, 0); + + /* Monitor all destinations if none is selected. */ + dst = FIELD_GET(NV_UCF_FILTER_DST, filter); + if (dst == 0) + dst = GENMASK_ULL(NV_UCF_DST_COUNT - 1, 0); + + ret = FIELD_PREP(NV_UCF_FILTER_SRC, src); + ret |= FIELD_PREP(NV_UCF_FILTER_DST, dst); + + return ret; +} enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, @@ -342,6 +412,21 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .init_data = NULL }, }, + { + .prodid = 0x2CF20000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_ucf_pmu_%u", + .name_fmt = NAME_FMT_SOCKET, + .template_ctx = { + .event_attr = ucf_pmu_event_attrs, + .format_attr = ucf_pmu_format_attrs, + .filter_mask = NV_UCF_FILTER_ID_MASK, + .filter_default_val = NV_UCF_FILTER_DEFAULT, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = ucf_pmu_event_filter, + }, + }, { .prodid = 0, .prodid_mask = 0, From 1f7f669846745d69b58c109ffbf108f202c13880 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:47 +0000 Subject: [PATCH 11/17] perf/arm_cspmu: Add arm_cspmu_acpi_dev_get Add interface to get ACPI device associated with the PMU. 
This ACPI device may contain additional properties not covered by the standard properties. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit bc86281fe4bd5d4a78be2f370e8319c9517e40ff) Signed-off-by: Matthew R. Ochs --- drivers/perf/arm_cspmu/arm_cspmu.c | 19 ++++++++++++++++++- drivers/perf/arm_cspmu/arm_cspmu.h | 17 ++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 34430b68f6025..49e8a1f381319 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -16,7 +16,7 @@ * The user should refer to the vendor technical documentation to get details * about the supported events. * - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ @@ -1132,6 +1132,23 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) return 0; } + +struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) +{ + char hid[16] = {}; + char uid[16] = {}; + const struct acpi_apmt_node *apmt_node; + + apmt_node = arm_cspmu_apmt_node(cspmu->dev); + if (!apmt_node || apmt_node->type != ACPI_APMT_NODE_TYPE_ACPI) + return NULL; + + memcpy(hid, &apmt_node->inst_primary, sizeof(apmt_node->inst_primary)); + snprintf(uid, sizeof(uid), "%u", apmt_node->inst_secondary); + + return acpi_dev_get_first_match_dev(hid, uid, -1); +} +EXPORT_SYMBOL_GPL(arm_cspmu_acpi_dev_get); #else static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) { diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index cd65a58dbd884..3fc5c8d772663 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 * * ARM CoreSight Architecture PMU driver. 
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ #ifndef __ARM_CSPMU_H__ #define __ARM_CSPMU_H__ +#include #include #include #include @@ -255,4 +256,18 @@ int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match); /* Unregister vendor backend. */ void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match); +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +/** + * Get ACPI device associated with the PMU. + * The caller is responsible for calling acpi_dev_put() on the returned device. + */ +struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu); +#else +static inline struct acpi_device * +arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) +{ + return NULL; +} +#endif + #endif /* __ARM_CSPMU_H__ */ From f0aab13d4398fae1c04e54c76d1a385fddc0450b Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:48 +0000 Subject: [PATCH 12/17] perf/arm_cspmu: nvidia: Add Tegra410 PCIE PMU Adds PCIE PMU support in Tegra410 SOC. This PMU is instanced in each root complex in the SOC and can capture traffic from PCIE device to various memory types. This PMU can filter traffic based on the originating root port or BDF and the target memory types (CPU DRAM, GPU Memory, CXL Memory, or remote Memory). Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit bf585ba14726788335c640512d11186dab573612) Signed-off-by: Matthew R. 
Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 163 ++++++++++++++ drivers/perf/arm_cspmu/nvidia_cspmu.c | 210 +++++++++++++++++- 2 files changed, 368 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 7b7ba5700ca19..b8cfbb80be1c1 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -6,6 +6,7 @@ The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) +* PCIE PMU Driver ---------- @@ -104,3 +105,165 @@ Example usage: destination filter = remote memory:: perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ + +PCIE PMU +-------- + +This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and +the memory subsystem. It monitors all read/write traffic from the root port(s) +or a particular BDF in a PCIE RC to local or remote memory. There is one PMU per +PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into +up to 8 root ports. The traffic from each root port can be filtered using RP or +BDF filter. For example, specifying "src_rp_mask=0xFF" means the PMU counter will +capture traffic from all RPs. Please see below for more details. + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>. + +The events in this PMU can be used to measure bandwidth, utilization, and +latency: + + * rd_req: count the number of read requests by PCIE device. + * wr_req: count the number of write requests by PCIE device. + * rd_bytes: count the number of bytes transferred by rd_req. + * wr_bytes: count the number of bytes transferred by wr_req. + * rd_cum_outs: count outstanding rd_req each cycle. 
+ * cycles: count the clock cycles of SOC fabric connected to the PCIE interface. + +The average bandwidth is calculated as:: + + AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS + AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_RD_REQUEST_RATE = RD_REQ / CYCLES + AVG_WR_REQUEST_RATE = WR_REQ / CYCLES + + +The average latency is calculated as:: + + FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ + AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ + +The PMU events can be filtered based on the traffic source and destination. +The source filter indicates the PCIE devices that will be monitored. The +destination filter specifies the destination memory type, e.g. local system +memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote +classification of the destination filter is based on the home socket of the +address, not where the data actually resides. These filters can be found in +/sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>/format/. + +The list of event filters: + +* Source filter: + + * src_rp_mask: bitmask of root ports that will be monitored. Each bit in this + bitmask represents the RP index in the RC. If the bit is set, all devices under + the associated RP will be monitored. E.g. "src_rp_mask=0xF" will monitor + devices in root port 0 to 3. + * src_bdf: the BDF that will be monitored. This is a 16-bit value that + follows the formula: (bus << 8) + (device << 3) + (function). For example, the + value of BDF 27:01.1 is 0x2709. + * src_bdf_en: enable the BDF filter. If this is set, the BDF filter value in + "src_bdf" is used to filter the traffic. + + Note that Root-Port and BDF filters are mutually exclusive and the PMU in + each RC can only have one BDF filter shared by all counters. If BDF filter + is enabled, the BDF filter value will be applied to all events. 
+ +* Destination filter: + + * dst_loc_cmem: if set, count events to local system memory (CMEM) address + * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address + * dst_loc_pcie_p2p: if set, count events to local PCIE peer address + * dst_loc_pcie_cxl: if set, count events to local CXL memory address + * dst_rem: if set, count events to remote memory address + +If the source filter is not specified, the PMU will count events from all root +ports. If the destination filter is not specified, the PMU will count events +to all destinations. + +Example usage: + +* Count event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/ + +* Count event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and + targeting just local CMEM of socket 0:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/ + +* Count event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and + targeting just local CMEM of socket 1:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/ + +Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA +Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space +for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". 
The DVSEC register +contains the following information to map PCIE devices under the RP back to its RC# : + + - Bus# (byte 0xc) : bus number as reported by the lspci output + - Segment# (byte 0xd) : segment number as reported by the lspci output + - RP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability + - RC# (byte 0xf): root complex number associated with the RP + - Socket# (byte 0x10): socket number associated with the RP + +Example script for mapping lspci BDF to RC# and socket#:: + + #!/bin/bash + while read bdf rest; do + dvsec4_reg=$(lspci -vv -s $bdf | awk ' + /Designated Vendor-Specific: Vendor=10de ID=0004/ { + match($0, /\[([0-9a-fA-F]+)/, arr); + print "0x" arr[1]; + exit + } + ') + if [ -n "$dvsec4_reg" ]; then + bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b) + segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b) + rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b) + rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b) + socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b) + echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket" + fi + done < <(lspci -d 10de:) + +Example output:: + + 0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00 + 0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00 + 0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00 + 0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00 + 0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00 + 0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00 + 0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00 + 0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00 + 0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00 + 0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00 + 0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00 + 0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01 + 
000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01 + 000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01 + 000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01 + 000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01 + 000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01 + 000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01 + 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 + 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 + 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 8e37cbe3bae99..61fde84ea3434 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -8,6 +8,7 @@ #include #include +#include #include #include "arm_cspmu.h" @@ -28,6 +29,19 @@ #define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) #define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) +#define NV_PCIE_V2_PORT_COUNT 8ULL +#define NV_PCIE_V2_FILTER_ID_MASK GENMASK_ULL(24, 0) +#define NV_PCIE_V2_FILTER_PORT GENMASK_ULL(NV_PCIE_V2_PORT_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER_BDF_VAL GENMASK_ULL(23, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_BDF_EN BIT(24) +#define NV_PCIE_V2_FILTER_BDF_VAL_EN GENMASK_ULL(24, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_DEFAULT NV_PCIE_V2_FILTER_PORT + +#define NV_PCIE_V2_DST_COUNT 5ULL +#define NV_PCIE_V2_FILTER2_ID_MASK GENMASK_ULL(4, 0) +#define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -161,6 +175,16 @@ static struct attribute *ucf_pmu_event_attrs[] = { NULL }; +static struct attribute *pcie_v2_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), + ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), + ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), + 
ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), + ARM_CSPMU_EVENT_ATTR(rd_cum_outs, 0x4), + ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -201,6 +225,19 @@ static struct attribute *ucf_pmu_format_attrs[] = { NULL }; +static struct attribute *pcie_v2_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_EVENT_ATTR, + ARM_CSPMU_FORMAT_ATTR(src_rp_mask, "config1:0-7"), + ARM_CSPMU_FORMAT_ATTR(src_bdf, "config1:8-23"), + ARM_CSPMU_FORMAT_ATTR(src_bdf_en, "config1:24"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config2:0"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config2:1"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_p2p, "config2:2"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_cxl, "config2:3"), + ARM_CSPMU_FORMAT_ATTR(dst_rem, "config2:4"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -232,6 +269,32 @@ nv_cspmu_get_name(const struct arm_cspmu *cspmu) return ctx->name; } +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + struct fwnode_handle *fwnode; + struct acpi_device *adev; + int ret; + + adev = arm_cspmu_acpi_dev_get(cspmu); + if (!adev) + return -ENODEV; + + fwnode = acpi_fwnode_handle(adev); + ret = fwnode_property_read_u32(fwnode, "instance_id", id); + if (ret) + dev_err(cspmu->dev, "Failed to get instance ID\n"); + + acpi_dev_put(adev); + return ret; +} +#else +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + return -EINVAL; +} +#endif + static u32 nv_cspmu_event_filter(const struct perf_event *event) { const struct nv_cspmu_ctx *ctx = @@ -277,6 +340,20 @@ static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, } } +static void nv_cspmu_reset_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + const struct nv_cspmu_ctx *ctx = + 
to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + const u32 offset = 4 * event->hw.idx; + + if (ctx->get_filter) + writel(0, cspmu->base0 + PMEVFILTR + offset); + + if (ctx->get_filter2) + writel(0, cspmu->base0 + PMEVFILT2R + offset); +} + static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { @@ -307,9 +384,103 @@ static u32 ucf_pmu_event_filter(const struct perf_event *event) return ret; } +static u32 pcie_v2_pmu_bdf_val_en(u32 filter) +{ + const u32 bdf_en = FIELD_GET(NV_PCIE_V2_FILTER_BDF_EN, filter); + + /* Returns both BDF value and enable bit if BDF filtering is enabled. */ + if (bdf_en) + return FIELD_GET(NV_PCIE_V2_FILTER_BDF_VAL_EN, filter); + + /* Ignore the BDF value if BDF filter is not enabled. */ + return 0; +} + +static u32 pcie_v2_pmu_event_filter(const struct perf_event *event) +{ + u32 filter, lead_filter, lead_bdf; + struct perf_event *leader; + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + + filter = event->attr.config1 & ctx->filter_mask; + if (filter != 0) + return filter; + + leader = event->group_leader; + + /* Use leader's filter value if its BDF filtering is enabled. */ + if (event != leader) { + lead_filter = pcie_v2_pmu_event_filter(leader); + lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); + if (lead_bdf != 0) + return lead_filter; + } + + /* Otherwise, return default filter value. */ + return ctx->filter_default_val; +} + +static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, + struct perf_event *new_ev) +{ + /* + * Make sure the events are using same BDF filter since the PCIE-SRC PMU + * only supports one common BDF filter setting for all of the counters. 
+ */ + + int idx; + u32 new_filter, new_rp, new_bdf, new_lead_filter, new_lead_bdf; + struct perf_event *new_leader; + + if (cspmu->impl.ops.is_cycle_counter_event(new_ev)) + return 0; + + new_leader = new_ev->group_leader; + + new_filter = pcie_v2_pmu_event_filter(new_ev); + new_lead_filter = pcie_v2_pmu_event_filter(new_leader); + + new_bdf = pcie_v2_pmu_bdf_val_en(new_filter); + new_lead_bdf = pcie_v2_pmu_bdf_val_en(new_lead_filter); + + new_rp = FIELD_GET(NV_PCIE_V2_FILTER_PORT, new_filter); + + if (new_rp != 0 && new_bdf != 0) { + dev_err(cspmu->dev, + "RP and BDF filtering are mutually exclusive\n"); + return -EINVAL; + } + + if (new_bdf != new_lead_bdf) { + dev_err(cspmu->dev, + "sibling and leader BDF value should be equal\n"); + return -EINVAL; + } + + /* Compare BDF filter on existing events. */ + idx = find_first_bit(cspmu->hw_events.used_ctrs, + cspmu->cycle_counter_logical_idx); + + if (idx != cspmu->cycle_counter_logical_idx) { + struct perf_event *leader = cspmu->hw_events.events[idx]->group_leader; + + const u32 lead_filter = pcie_v2_pmu_event_filter(leader); + const u32 lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); + + if (new_lead_bdf != lead_bdf) { + dev_err(cspmu->dev, "only one BDF value is supported\n"); + return -EINVAL; + } + } + + return 0; +} + enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, - NAME_FMT_SOCKET + NAME_FMT_SOCKET, + NAME_FMT_SOCKET_INST, }; struct nv_cspmu_match { @@ -427,6 +598,26 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .get_filter = ucf_pmu_event_filter, }, }, + { + .prodid = 0x10301000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_pcie_pmu_%u_rc_%u", + .name_fmt = NAME_FMT_SOCKET_INST, + .template_ctx = { + .event_attr = pcie_v2_pmu_event_attrs, + .format_attr = pcie_v2_pmu_format_attrs, + .filter_mask = NV_PCIE_V2_FILTER_ID_MASK, + .filter_default_val = NV_PCIE_V2_FILTER_DEFAULT, + .filter2_mask = NV_PCIE_V2_FILTER2_ID_MASK, + .filter2_default_val = NV_PCIE_V2_FILTER2_DEFAULT, + .get_filter = 
pcie_v2_pmu_event_filter, + .get_filter2 = nv_cspmu_event_filter2, + }, + .ops = { + .validate_event = pcie_v2_pmu_validate_event, + .reset_ev_filter = nv_cspmu_reset_ev_filter, + } + }, { .prodid = 0, .prodid_mask = 0, @@ -450,7 +641,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, const struct nv_cspmu_match *match) { - char *name; + char *name = NULL; struct device *dev = cspmu->dev; static atomic_t pmu_generic_idx = {0}; @@ -464,13 +655,20 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, socket); break; } + case NAME_FMT_SOCKET_INST: { + const int cpu = cpumask_first(&cspmu->associated_cpus); + const int socket = cpu_to_node(cpu); + u32 inst_id; + + if (!nv_cspmu_get_inst_id(cspmu, &inst_id)) + name = devm_kasprintf(dev, GFP_KERNEL, + match->name_pattern, socket, inst_id); + break; + } case NAME_FMT_GENERIC: name = devm_kasprintf(dev, GFP_KERNEL, match->name_pattern, atomic_fetch_inc(&pmu_generic_idx)); break; - default: - name = NULL; - break; } return name; @@ -511,8 +709,10 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. */ + SET_OP(validate_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); + SET_OP(reset_ev_filter, impl_ops, match, NULL); SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs); SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs); SET_OP(get_name, impl_ops, match, nv_cspmu_get_name); From 1ba0cbbe4128c3a93fe179942be184edd7c50dcb Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:49 +0000 Subject: [PATCH 13/17] perf/arm_cspmu: nvidia: Add Tegra410 PCIE-TGT PMU Adds PCIE-TGT PMU support in Tegra410 SOC. 
This PMU is instanced in each root complex in the SOC and it captures traffic originating from any source towards PCIE BAR and CXL HDM range. The traffic can be filtered based on the destination root port or target address range. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit 3dd73022306bfdb29b1c33cb106fe337f46a6105) Signed-off-by: Matthew R. Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 77 +++++ drivers/perf/arm_cspmu/nvidia_cspmu.c | 321 ++++++++++++++++++ 2 files changed, 398 insertions(+) diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index b8cfbb80be1c1..c065764d41fea 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -7,6 +7,7 @@ metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) * PCIE +* PCIE-TGT PMU Driver ---------- @@ -212,6 +213,11 @@ Example usage: perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/ +.. _NVIDIA_T410_PCIE_PMU_RC_Mapping_Section: + +Mapping the RC# to lspci segment number +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". The DVSEC register @@ -267,3 +273,74 @@ Example output:: 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 + +PCIE-TGT PMU +------------ + +This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and +the memory subsystem. It monitors traffic targeting PCIE BAR and CXL HDM ranges. +There is one PCIE-TGT PMU per PCIE RC in the SoC. 
Each RC in Tegra410 SoC can +have up to 16 lanes that can be bifurcated into up to 8 root ports (RP). The PMU +provides RP filter to count PCIE BAR traffic to each RP and address filter to +count access to PCIE BAR or CXL HDM ranges. The details of the filters are +described in the following sections. + +Mapping the RC# to lspci segment number is similar to the PCIE PMU. Please see +:ref:`NVIDIA_T410_PCIE_PMU_RC_Mapping_Section` for more info. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>. + +The events in this PMU can be used to measure bandwidth and utilization: + + * rd_req: count the number of read requests to PCIE. + * wr_req: count the number of write requests to PCIE. + * rd_bytes: count the number of bytes transferred by rd_req. + * wr_bytes: count the number of bytes transferred by wr_req. + * cycles: count the clock cycles of SOC fabric connected to the PCIE interface. + +The average bandwidth is calculated as:: + + AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS + AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_RD_REQUEST_RATE = RD_REQ / CYCLES + AVG_WR_REQUEST_RATE = WR_REQ / CYCLES + +The PMU events can be filtered based on the destination root port or target +address range. Filtering based on RP is only available for PCIE BAR traffic. +Address filter works for both PCIE BAR and CXL HDM ranges. These filters can be +found in sysfs, see +/sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>/format/. + +Destination filter settings: + +* dst_rp_mask: bitmask to select the root port(s) to monitor. E.g. "dst_rp_mask=0xFF" + corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is + only available for PCIE BAR traffic. +* dst_addr_base: BAR or CXL HDM filter base address. +* dst_addr_mask: BAR or CXL HDM filter address mask. 
+* dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the + address range specified by "dst_addr_base" and "dst_addr_mask" will be used to filter + the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison + to determine if the traffic destination address falls within the filter range:: + + (txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask) + + If the comparison succeeds, then the event will be counted. + +If the destination filter is not specified, the RP filter will be configured by default +to count PCIE BAR traffic to all root ports. + +Example usage: + +* Count event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0:: + + perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/ + +* Count event id 0x1 for accesses to PCIE BAR or CXL HDM address range + 0x10000 to 0x100FF on socket 0's PCIE RC-1:: + + perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/ diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 61fde84ea3434..bac83e424d6dc 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -42,6 +42,24 @@ #define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) #define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST +#define NV_PCIE_TGT_PORT_COUNT 8ULL +#define NV_PCIE_TGT_EV_TYPE_CC 0x4 +#define NV_PCIE_TGT_EV_TYPE_COUNT 3ULL +#define NV_PCIE_TGT_EV_TYPE_MASK GENMASK_ULL(NV_PCIE_TGT_EV_TYPE_COUNT - 1, 0) +#define NV_PCIE_TGT_FILTER2_MASK GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT, 0) +#define NV_PCIE_TGT_FILTER2_PORT GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT - 1, 0) +#define NV_PCIE_TGT_FILTER2_ADDR_EN BIT(NV_PCIE_TGT_PORT_COUNT) +#define NV_PCIE_TGT_FILTER2_ADDR GENMASK_ULL(15, NV_PCIE_TGT_PORT_COUNT) +#define NV_PCIE_TGT_FILTER2_DEFAULT NV_PCIE_TGT_FILTER2_PORT + +#define NV_PCIE_TGT_ADDR_COUNT 8ULL +#define NV_PCIE_TGT_ADDR_STRIDE 20 
+#define NV_PCIE_TGT_ADDR_CTRL 0xD38 +#define NV_PCIE_TGT_ADDR_BASE_LO 0xD3C +#define NV_PCIE_TGT_ADDR_BASE_HI 0xD40 +#define NV_PCIE_TGT_ADDR_MASK_LO 0xD44 +#define NV_PCIE_TGT_ADDR_MASK_HI 0xD48 + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -185,6 +203,15 @@ static struct attribute *pcie_v2_pmu_event_attrs[] = { NULL }; +static struct attribute *pcie_tgt_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), + ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), + ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), + ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), + ARM_CSPMU_EVENT_ATTR(cycles, NV_PCIE_TGT_EV_TYPE_CC), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -238,6 +265,15 @@ static struct attribute *pcie_v2_pmu_format_attrs[] = { NULL }; +static struct attribute *pcie_tgt_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_ATTR(event, "config:0-2"), + ARM_CSPMU_FORMAT_ATTR(dst_rp_mask, "config:3-10"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_en, "config:11"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_base, "config1:0-63"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_mask, "config2:0-63"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -477,6 +513,267 @@ static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, return 0; } +struct pcie_tgt_addr_filter { + u32 refcount; + u64 base; + u64 mask; +}; + +struct pcie_tgt_data { + struct pcie_tgt_addr_filter addr_filter[NV_PCIE_TGT_ADDR_COUNT]; + void __iomem *addr_filter_reg; +}; + +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +static int pcie_tgt_init_data(struct arm_cspmu *cspmu) +{ + int ret; + struct acpi_device *adev; + struct pcie_tgt_data *data; + struct list_head resource_list; + struct resource_entry *rentry; + struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu); + struct device *dev = cspmu->dev; + + data = 
devm_kzalloc(dev, sizeof(struct pcie_tgt_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + adev = arm_cspmu_acpi_dev_get(cspmu); + if (!adev) { + dev_err(dev, "failed to get associated PCIE-TGT device\n"); + return -ENODEV; + } + + INIT_LIST_HEAD(&resource_list); + ret = acpi_dev_get_memory_resources(adev, &resource_list); + if (ret < 0) { + dev_err(dev, "failed to get PCIE-TGT device memory resources\n"); + acpi_dev_put(adev); + return ret; + } + + rentry = list_first_entry_or_null( + &resource_list, struct resource_entry, node); + if (rentry) { + data->addr_filter_reg = devm_ioremap_resource(dev, rentry->res); + ret = 0; + } + + if (IS_ERR(data->addr_filter_reg)) { + dev_err(dev, "failed to get address filter resource\n"); + ret = PTR_ERR(data->addr_filter_reg); + } + + acpi_dev_free_resource_list(&resource_list); + acpi_dev_put(adev); + + ctx->data = data; + + return ret; +} +#else +static int pcie_tgt_init_data(struct arm_cspmu *cspmu) +{ + return -ENODEV; +} +#endif + +static struct pcie_tgt_data *pcie_tgt_get_data(struct arm_cspmu *cspmu) +{ + struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu); + + return ctx->data; +} + +/* Find the first available address filter slot. 
*/ +static int pcie_tgt_find_addr_idx(struct arm_cspmu *cspmu, u64 base, u64 mask, + bool is_reset) +{ + int i; + struct pcie_tgt_data *data = pcie_tgt_get_data(cspmu); + + for (i = 0; i < NV_PCIE_TGT_ADDR_COUNT; i++) { + if (!is_reset && data->addr_filter[i].refcount == 0) + return i; + + if (data->addr_filter[i].base == base && + data->addr_filter[i].mask == mask) + return i; + } + + return -ENODEV; +} + +static u32 pcie_tgt_pmu_event_filter(const struct perf_event *event) +{ + u32 filter; + + filter = (event->attr.config >> NV_PCIE_TGT_EV_TYPE_COUNT) & + NV_PCIE_TGT_FILTER2_MASK; + + return filter; +} + +static bool pcie_tgt_pmu_addr_en(const struct perf_event *event) +{ + u32 filter = pcie_tgt_pmu_event_filter(event); + + return FIELD_GET(NV_PCIE_TGT_FILTER2_ADDR_EN, filter) != 0; +} + +static u32 pcie_tgt_pmu_port_filter(const struct perf_event *event) +{ + u32 filter = pcie_tgt_pmu_event_filter(event); + + return FIELD_GET(NV_PCIE_TGT_FILTER2_PORT, filter); +} + +static u64 pcie_tgt_pmu_dst_addr_base(const struct perf_event *event) +{ + return event->attr.config1; +} + +static u64 pcie_tgt_pmu_dst_addr_mask(const struct perf_event *event) +{ + return event->attr.config2; +} + +static int pcie_tgt_pmu_validate_event(struct arm_cspmu *cspmu, + struct perf_event *new_ev) +{ + u64 base, mask; + int idx; + + if (!pcie_tgt_pmu_addr_en(new_ev)) + return 0; + + /* Make sure there is a slot available for the address filter. 
*/
+	base = pcie_tgt_pmu_dst_addr_base(new_ev);
+	mask = pcie_tgt_pmu_dst_addr_mask(new_ev);
+	idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false);
+	if (idx < 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Take (@en == true) or drop (@en == false) one reference on address filter
+ * slot @idx.
+ *
+ * The first reference records @base/@mask in the slot and programs and
+ * enables the hardware filter registers; dropping the last reference
+ * disables the filter and clears both the registers and the recorded range.
+ * Dropping a reference on a slot that has none is reported and ignored so
+ * the reference count can never go past zero.
+ */
+static void pcie_tgt_pmu_config_addr_filter(struct arm_cspmu *cspmu,
+	bool en, u64 base, u64 mask, int idx)
+{
+	struct pcie_tgt_data *data;
+	struct pcie_tgt_addr_filter *filter;
+	void __iomem *filter_reg;
+
+	data = pcie_tgt_get_data(cspmu);
+	filter = &data->addr_filter[idx];
+	filter_reg = data->addr_filter_reg + (idx * NV_PCIE_TGT_ADDR_STRIDE);
+
+	if (en) {
+		filter->refcount++;
+		if (filter->refcount == 1) {
+			filter->base = base;
+			filter->mask = mask;
+
+			writel(lower_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_LO);
+			writel(upper_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_HI);
+			writel(lower_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_LO);
+			writel(upper_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_HI);
+			writel(1, filter_reg + NV_PCIE_TGT_ADDR_CTRL);
+		}
+	} else {
+		/*
+		 * An unbalanced release would push the count past zero, after
+		 * which the slot is never seen as free again.
+		 */
+		if (filter->refcount == 0) {
+			dev_warn(cspmu->dev,
+				 "address filter slot %d released while unused\n",
+				 idx);
+			return;
+		}
+
+		filter->refcount--;
+		if (filter->refcount == 0) {
+			/* Disable the filter before clearing its range. */
+			writel(0, filter_reg + NV_PCIE_TGT_ADDR_CTRL);
+			writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_LO);
+			writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_HI);
+			writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_LO);
+			writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_HI);
+
+			filter->base = 0;
+			filter->mask = 0;
+		}
+	}
+}
+
+static void pcie_tgt_pmu_set_ev_filter(struct arm_cspmu *cspmu,
+	const struct perf_event *event)
+{
+	bool addr_filter_en;
+	int idx;
+	u32 filter2_val, filter2_offset, port_filter;
+	u64 base, mask;
+
+	filter2_val = 0;
+	filter2_offset = PMEVFILT2R + (4 * event->hw.idx);
+
+	addr_filter_en = pcie_tgt_pmu_addr_en(event);
+	if (addr_filter_en) {
+		base = pcie_tgt_pmu_dst_addr_base(event);
+		mask = pcie_tgt_pmu_dst_addr_mask(event);
+		idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false);
+
+		if (idx < 0) {
+			dev_err(cspmu->dev,
+				"Unable to find a slot for address filtering\n");
+			writel(0, cspmu->base0 + 
filter2_offset); + return; + } + + /* Configure address range filter registers.*/ + pcie_tgt_pmu_config_addr_filter(cspmu, true, base, mask, idx); + + /* Config the counter to use the selected address filter slot. */ + filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_ADDR, 1U << idx); + } + + port_filter = pcie_tgt_pmu_port_filter(event); + + /* Monitor all ports if no filter is selected. */ + if (!addr_filter_en && port_filter == 0) + port_filter = NV_PCIE_TGT_FILTER2_PORT; + + filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_PORT, port_filter); + + writel(filter2_val, cspmu->base0 + filter2_offset); +} + +static void pcie_tgt_pmu_reset_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + bool addr_filter_en; + u64 base, mask; + int idx; + + addr_filter_en = pcie_tgt_pmu_addr_en(event); + if (!addr_filter_en) + return; + + base = pcie_tgt_pmu_dst_addr_base(event); + mask = pcie_tgt_pmu_dst_addr_mask(event); + idx = pcie_tgt_find_addr_idx(cspmu, base, mask, true); + + if (idx < 0) { + dev_err(cspmu->dev, + "Unable to find the address filter slot to reset\n"); + return; + } + + pcie_tgt_pmu_config_addr_filter(cspmu, false, base, mask, idx); +} + +static u32 pcie_tgt_pmu_event_type(const struct perf_event *event) +{ + return event->attr.config & NV_PCIE_TGT_EV_TYPE_MASK; +} + +static bool pcie_tgt_pmu_is_cycle_counter_event(const struct perf_event *event) +{ + u32 event_type = pcie_tgt_pmu_event_type(event); + + return event_type == NV_PCIE_TGT_EV_TYPE_CC; +} + enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, NAME_FMT_SOCKET, @@ -618,6 +915,28 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .reset_ev_filter = nv_cspmu_reset_ev_filter, } }, + { + .prodid = 0x10700000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_pcie_tgt_pmu_%u_rc_%u", + .name_fmt = NAME_FMT_SOCKET_INST, + .template_ctx = { + .event_attr = pcie_tgt_pmu_event_attrs, + .format_attr = pcie_tgt_pmu_format_attrs, + .filter_mask = 0x0, + .filter_default_val = 0x0, + 
.filter2_mask = NV_PCIE_TGT_FILTER2_MASK, + .filter2_default_val = NV_PCIE_TGT_FILTER2_DEFAULT, + .init_data = pcie_tgt_init_data + }, + .ops = { + .is_cycle_counter_event = pcie_tgt_pmu_is_cycle_counter_event, + .event_type = pcie_tgt_pmu_event_type, + .validate_event = pcie_tgt_pmu_validate_event, + .set_ev_filter = pcie_tgt_pmu_set_ev_filter, + .reset_ev_filter = pcie_tgt_pmu_reset_ev_filter, + } + }, { .prodid = 0, .prodid_mask = 0, @@ -710,6 +1029,8 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) /* NVIDIA specific callbacks. */ SET_OP(validate_event, impl_ops, match, NULL); + SET_OP(event_type, impl_ops, match, NULL); + SET_OP(is_cycle_counter_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); SET_OP(reset_ev_filter, impl_ops, match, NULL); From ae82af010ba6e08f12b6426ff2433d77d098e331 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:50 +0000 Subject: [PATCH 14/17] perf: add NVIDIA Tegra410 CPU Memory Latency PMU Adds CPU Memory (CMEM) Latency PMU support in Tegra410 SOC. The PMU is used to measure latency between the edge of the Unified Coherence Fabric to the local system DRAM. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit 429b7638b2df5538e945aaa2cc189cf0d6e8fb3a) Signed-off-by: Matthew R. 
Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 25 + drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/nvidia_t410_cmem_latency_pmu.c | 736 ++++++++++++++++++ 4 files changed, 769 insertions(+) create mode 100644 drivers/perf/nvidia_t410_cmem_latency_pmu.c diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index c065764d41fea..9945c43f6a7a5 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -8,6 +8,7 @@ metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) * PCIE * PCIE-TGT +* CPU Memory (CMEM) Latency PMU Driver ---------- @@ -344,3 +345,27 @@ Example usage: 0x10000 to 0x100FF on socket 0's PCIE RC-1:: perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/ + +CPU Memory (CMEM) Latency PMU +----------------------------- + +This PMU monitors latency events of memory read requests from the edge of the +Unified Coherence Fabric (UCF) to local CPU DRAM: + + * RD_REQ counters: count read requests (32B per request). + * RD_CUM_OUTS counters: accumulated outstanding request counter, which tracks + how many cycles the read requests are in flight. + * CYCLES counter: counts the number of elapsed cycles. + +The average latency is calculated as:: + + FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ + AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_<socket-id>. 
+ +Example usage:: + + perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}' diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 638321fc9800c..26e86067d8f93 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -311,4 +311,11 @@ config MARVELL_PEM_PMU Enable support for PCIe Interface performance monitoring on Marvell platform. +config NVIDIA_TEGRA410_CMEM_LATENCY_PMU + tristate "NVIDIA Tegra410 CPU Memory Latency PMU" + depends on ARM64 && ACPI + help + Enable perf support for CPU memory latency counters monitoring on + NVIDIA Tegra410 SoC. + endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index ea52711a87e32..4aa6aad393c2d 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -35,3 +35,4 @@ obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o +obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o diff --git a/drivers/perf/nvidia_t410_cmem_latency_pmu.c b/drivers/perf/nvidia_t410_cmem_latency_pmu.c new file mode 100644 index 0000000000000..acb8f5571522c --- /dev/null +++ b/drivers/perf/nvidia_t410_cmem_latency_pmu.c @@ -0,0 +1,736 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVIDIA Tegra410 CPU Memory (CMEM) Latency PMU driver. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_INSTANCES 14 + +/* Register offsets. 
*/ +#define CMEM_LAT_CG_CTRL 0x800 +#define CMEM_LAT_CTRL 0x808 +#define CMEM_LAT_STATUS 0x810 +#define CMEM_LAT_CYCLE_CNTR 0x818 +#define CMEM_LAT_MC0_REQ_CNTR 0x820 +#define CMEM_LAT_MC0_AOR_CNTR 0x830 +#define CMEM_LAT_MC1_REQ_CNTR 0x838 +#define CMEM_LAT_MC1_AOR_CNTR 0x848 +#define CMEM_LAT_MC2_REQ_CNTR 0x850 +#define CMEM_LAT_MC2_AOR_CNTR 0x860 + +/* CMEM_LAT_CTRL values. */ +#define CMEM_LAT_CTRL_DISABLE 0x0ULL +#define CMEM_LAT_CTRL_ENABLE 0x1ULL +#define CMEM_LAT_CTRL_CLR 0x2ULL + +/* CMEM_LAT_CG_CTRL values. */ +#define CMEM_LAT_CG_CTRL_DISABLE 0x0ULL +#define CMEM_LAT_CG_CTRL_ENABLE 0x1ULL + +/* CMEM_LAT_STATUS register field. */ +#define CMEM_LAT_STATUS_CYCLE_OVF BIT(0) +#define CMEM_LAT_STATUS_MC0_AOR_OVF BIT(1) +#define CMEM_LAT_STATUS_MC0_REQ_OVF BIT(3) +#define CMEM_LAT_STATUS_MC1_AOR_OVF BIT(4) +#define CMEM_LAT_STATUS_MC1_REQ_OVF BIT(6) +#define CMEM_LAT_STATUS_MC2_AOR_OVF BIT(7) +#define CMEM_LAT_STATUS_MC2_REQ_OVF BIT(9) + +/* Events. */ +#define CMEM_LAT_EVENT_CYCLES 0x0 +#define CMEM_LAT_EVENT_REQ 0x1 +#define CMEM_LAT_EVENT_AOR 0x2 + +#define CMEM_LAT_NUM_EVENTS 0x3 +#define CMEM_LAT_MASK_EVENT 0x3 +#define CMEM_LAT_MAX_ACTIVE_EVENTS 32 + +#define CMEM_LAT_ACTIVE_CPU_MASK 0x0 +#define CMEM_LAT_ASSOCIATED_CPU_MASK 0x1 + +static unsigned long cmem_lat_pmu_cpuhp_state; + +struct cmem_lat_pmu_hw_events { + struct perf_event *events[CMEM_LAT_MAX_ACTIVE_EVENTS]; + DECLARE_BITMAP(used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS); +}; + +struct cmem_lat_pmu { + struct pmu pmu; + struct device *dev; + const char *name; + const char *identifier; + void __iomem *base_broadcast; + void __iomem *base[NUM_INSTANCES]; + cpumask_t associated_cpus; + cpumask_t active_cpu; + struct hlist_node node; + struct cmem_lat_pmu_hw_events hw_events; +}; + +#define to_cmem_lat_pmu(p) \ + container_of(p, struct cmem_lat_pmu, pmu) + + +/* Get event type from perf_event. 
*/
+static inline u32 get_event_type(struct perf_event *event)
+{
+	/* The event id is the part of attr.config selected by CMEM_LAT_MASK_EVENT. */
+	return (event->attr.config) & CMEM_LAT_MASK_EVENT;
+}
+
+/* PMU operations. */
+/*
+ * Claim the lowest clear bit of @hw_events->used_ctrs.
+ *
+ * Returns the claimed index, or -EAGAIN when all CMEM_LAT_MAX_ACTIVE_EVENTS
+ * bits are already set. @event is not used by the lookup.
+ */
+static int cmem_lat_pmu_get_event_idx(struct cmem_lat_pmu_hw_events *hw_events,
+				      struct perf_event *event)
+{
+	unsigned int idx;
+
+	idx = find_first_zero_bit(hw_events->used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS);
+	if (idx >= CMEM_LAT_MAX_ACTIVE_EVENTS)
+		return -EAGAIN;
+
+	set_bit(idx, hw_events->used_ctrs);
+
+	return idx;
+}
+
+/*
+ * Check whether @event fits into the bookkeeping in @hw_events.
+ *
+ * Software events always pass without taking a slot. An event that belongs
+ * to a PMU other than @pmu fails. Any other event passes only if a slot can
+ * be claimed for it in @hw_events.
+ */
+static bool cmem_lat_pmu_validate_event(struct pmu *pmu,
+				 struct cmem_lat_pmu_hw_events *hw_events,
+				 struct perf_event *event)
+{
+	int ret;
+
+	if (is_software_event(event))
+		return true;
+
+	/* Reject groups spanning multiple HW PMUs. */
+	if (event->pmu != pmu)
+		return false;
+
+	ret = cmem_lat_pmu_get_event_idx(hw_events, event);
+	if (ret < 0)
+		return false;
+
+	return true;
+}
+
+/* Make sure the group of events can be scheduled at once on the PMU. */
+static bool cmem_lat_pmu_validate_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct cmem_lat_pmu_hw_events fake_hw_events;
+
+	/* An event that is its own leader has nothing to be checked against. */
+	if (event->group_leader == event)
+		return true;
+
+	/* Scratch bookkeeping: slots are claimed here, not in the PMU's own state. */
+	memset(&fake_hw_events, 0, sizeof(fake_hw_events));
+
+	if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, leader))
+		return false;
+
+	for_each_sibling_event(sibling, leader) {
+		if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, sibling))
+			return false;
+	}
+
+	/* Finally check the new event itself on top of leader and siblings. */
+	return cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, event);
+}
+
+static int cmem_lat_pmu_event_init(struct perf_event *event)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 event_type = get_event_type(event);
+
+	/* Only events of this PMU's type with a known event id are accepted. */
+	if (event->attr.type != event->pmu->type ||
+	    event_type >= CMEM_LAT_NUM_EVENTS)
+		return -ENOENT;
+
+	/*
+	 * Sampling, per-process mode, and per-task counters are not supported
+	 * since this PMU is shared across all CPUs. 
+ */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(cmem_lat_pmu->pmu.dev, + "Can't support sampling and per-process mode\n"); + return -EOPNOTSUPP; + } + + if (event->cpu < 0) { + dev_dbg(cmem_lat_pmu->pmu.dev, "Can't support per-task counters\n"); + return -EINVAL; + } + + /* + * Make sure the CPU assignment is on one of the CPUs associated with + * this PMU. + */ + if (!cpumask_test_cpu(event->cpu, &cmem_lat_pmu->associated_cpus)) { + dev_dbg(cmem_lat_pmu->pmu.dev, + "Requested cpu is not associated with the PMU\n"); + return -EINVAL; + } + + /* Enforce the current active CPU to handle the events in this PMU. */ + event->cpu = cpumask_first(&cmem_lat_pmu->active_cpu); + if (event->cpu >= nr_cpu_ids) + return -EINVAL; + + if (!cmem_lat_pmu_validate_group(event)) + return -EINVAL; + + hwc->idx = -1; + hwc->config = event_type; + + return 0; +} + +static u64 cmem_lat_pmu_read_status(struct cmem_lat_pmu *cmem_lat_pmu, + unsigned int inst) +{ + return readq(cmem_lat_pmu->base[inst] + CMEM_LAT_STATUS); +} + +static u64 cmem_lat_pmu_read_cycle_counter(struct perf_event *event) +{ + const unsigned int instance = 0; + u64 status; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* + * Use the reading from first instance since all instances are + * identical. + */ + status = cmem_lat_pmu_read_status(cmem_lat_pmu, instance); + if (status & CMEM_LAT_STATUS_CYCLE_OVF) + dev_warn(dev, "Cycle counter overflow\n"); + + return readq(cmem_lat_pmu->base[instance] + CMEM_LAT_CYCLE_CNTR); +} + +static u64 cmem_lat_pmu_read_req_counter(struct perf_event *event) +{ + unsigned int i; + u64 status, val = 0; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* Sum up the counts from all instances. 
*/ + for (i = 0; i < NUM_INSTANCES; i++) { + status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); + if (status & CMEM_LAT_STATUS_MC0_REQ_OVF) + dev_warn(dev, "MC0 request counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC1_REQ_OVF) + dev_warn(dev, "MC1 request counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC2_REQ_OVF) + dev_warn(dev, "MC2 request counter overflow\n"); + + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_REQ_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_REQ_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_REQ_CNTR); + } + + return val; +} + +static u64 cmem_lat_pmu_read_aor_counter(struct perf_event *event) +{ + unsigned int i; + u64 status, val = 0; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* Sum up the counts from all instances. */ + for (i = 0; i < NUM_INSTANCES; i++) { + status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); + if (status & CMEM_LAT_STATUS_MC0_AOR_OVF) + dev_warn(dev, "MC0 AOR counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC1_AOR_OVF) + dev_warn(dev, "MC1 AOR counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC2_AOR_OVF) + dev_warn(dev, "MC2 AOR counter overflow\n"); + + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_AOR_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_AOR_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_AOR_CNTR); + } + + return val; +} + +static u64 (*read_counter_fn[CMEM_LAT_NUM_EVENTS])(struct perf_event *) = { + [CMEM_LAT_EVENT_CYCLES] = cmem_lat_pmu_read_cycle_counter, + [CMEM_LAT_EVENT_REQ] = cmem_lat_pmu_read_req_counter, + [CMEM_LAT_EVENT_AOR] = cmem_lat_pmu_read_aor_counter, +}; + +static void cmem_lat_pmu_event_update(struct perf_event *event) +{ + u32 event_type; + u64 prev, now; + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_STOPPED) + return; + + event_type = hwc->config; + + do { + prev = local64_read(&hwc->prev_count); + now = 
read_counter_fn[event_type](event); + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); + + local64_add(now - prev, &event->count); + + hwc->state |= PERF_HES_UPTODATE; +} + +static void cmem_lat_pmu_start(struct perf_event *event, int pmu_flags) +{ + event->hw.state = 0; +} + +static void cmem_lat_pmu_stop(struct perf_event *event, int pmu_flags) +{ + event->hw.state |= PERF_HES_STOPPED; +} + +static int cmem_lat_pmu_add(struct perf_event *event, int flags) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx; + + if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &cmem_lat_pmu->associated_cpus))) + return -ENOENT; + + idx = cmem_lat_pmu_get_event_idx(hw_events, event); + if (idx < 0) + return idx; + + hw_events->events[idx] = event; + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + if (flags & PERF_EF_START) + cmem_lat_pmu_start(event, PERF_EF_RELOAD); + + /* Propagate changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); + + return 0; +} + +static void cmem_lat_pmu_del(struct perf_event *event, int flags) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + cmem_lat_pmu_stop(event, PERF_EF_UPDATE); + + hw_events->events[idx] = NULL; + + clear_bit(idx, hw_events->used_ctrs); + + perf_event_update_userpage(event); +} + +static void cmem_lat_pmu_read(struct perf_event *event) +{ + cmem_lat_pmu_event_update(event); +} + +static inline void cmem_lat_pmu_cg_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, + u64 val) +{ + writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CG_CTRL); +} + +static inline void cmem_lat_pmu_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val) +{ + writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CTRL); +} + +static void cmem_lat_pmu_enable(struct pmu *pmu) +{ + bool disabled; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + + disabled = bitmap_empty(cmem_lat_pmu->hw_events.used_ctrs, + CMEM_LAT_MAX_ACTIVE_EVENTS); + + if (disabled) + return; + + /* Enable all the counters. */ + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE); + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_ENABLE); +} + +static void cmem_lat_pmu_disable(struct pmu *pmu) +{ + int idx; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + + /* Disable all the counters. */ + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_DISABLE); + + /* + * The counters will start from 0 again on restart. + * Update the events immediately to avoid losing the counts. 
+ */ + for_each_set_bit(idx, cmem_lat_pmu->hw_events.used_ctrs, + CMEM_LAT_MAX_ACTIVE_EVENTS) { + struct perf_event *event = cmem_lat_pmu->hw_events.events[idx]; + + if (!event) + continue; + + cmem_lat_pmu_event_update(event); + + local64_set(&event->hw.prev_count, 0ULL); + } + + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR); + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE); +} + +/* PMU identifier attribute. */ + +static ssize_t cmem_lat_pmu_identifier_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(page, "%s\n", cmem_lat_pmu->identifier); +} + +static struct device_attribute cmem_lat_pmu_identifier_attr = + __ATTR(identifier, 0444, cmem_lat_pmu_identifier_show, NULL); + +static struct attribute *cmem_lat_pmu_identifier_attrs[] = { + &cmem_lat_pmu_identifier_attr.attr, + NULL +}; + +static struct attribute_group cmem_lat_pmu_identifier_attr_group = { + .attrs = cmem_lat_pmu_identifier_attrs, +}; + +/* Format attributes. */ + +#define NV_PMU_EXT_ATTR(_name, _func, _config) \ + (&((struct dev_ext_attribute[]){ \ + { \ + .attr = __ATTR(_name, 0444, _func, NULL), \ + .var = (void *)_config \ + } \ + })[0].attr.attr) + +static struct attribute *cmem_lat_pmu_formats[] = { + NV_PMU_EXT_ATTR(event, device_show_string, "config:0-1"), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_format_group = { + .name = "format", + .attrs = cmem_lat_pmu_formats, +}; + +/* Event attributes. 
*/ + +static ssize_t cmem_lat_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, typeof(*pmu_attr), attr); + return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); +} + +#define NV_PMU_EVENT_ATTR(_name, _config) \ + PMU_EVENT_ATTR_ID(_name, cmem_lat_pmu_sysfs_event_show, _config) + +static struct attribute *cmem_lat_pmu_events[] = { + NV_PMU_EVENT_ATTR(cycles, CMEM_LAT_EVENT_CYCLES), + NV_PMU_EVENT_ATTR(rd_req, CMEM_LAT_EVENT_REQ), + NV_PMU_EVENT_ATTR(rd_cum_outs, CMEM_LAT_EVENT_AOR), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_events_group = { + .name = "events", + .attrs = cmem_lat_pmu_events, +}; + +/* Cpumask attributes. */ + +static ssize_t cmem_lat_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + struct dev_ext_attribute *eattr = + container_of(attr, struct dev_ext_attribute, attr); + unsigned long mask_id = (unsigned long)eattr->var; + const cpumask_t *cpumask; + + switch (mask_id) { + case CMEM_LAT_ACTIVE_CPU_MASK: + cpumask = &cmem_lat_pmu->active_cpu; + break; + case CMEM_LAT_ASSOCIATED_CPU_MASK: + cpumask = &cmem_lat_pmu->associated_cpus; + break; + default: + return 0; + } + return cpumap_print_to_pagebuf(true, buf, cpumask); +} + +#define NV_PMU_CPUMASK_ATTR(_name, _config) \ + NV_PMU_EXT_ATTR(_name, cmem_lat_pmu_cpumask_show, \ + (unsigned long)_config) + +static struct attribute *cmem_lat_pmu_cpumask_attrs[] = { + NV_PMU_CPUMASK_ATTR(cpumask, CMEM_LAT_ACTIVE_CPU_MASK), + NV_PMU_CPUMASK_ATTR(associated_cpus, CMEM_LAT_ASSOCIATED_CPU_MASK), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_cpumask_attr_group = { + .attrs = cmem_lat_pmu_cpumask_attrs, +}; + +/* Per PMU device attribute groups. 
*/ + +static const struct attribute_group *cmem_lat_pmu_attr_groups[] = { + &cmem_lat_pmu_identifier_attr_group, + &cmem_lat_pmu_format_group, + &cmem_lat_pmu_events_group, + &cmem_lat_pmu_cpumask_attr_group, + NULL +}; + +static int cmem_lat_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct cmem_lat_pmu *cmem_lat_pmu = + hlist_entry_safe(node, struct cmem_lat_pmu, node); + + if (!cpumask_test_cpu(cpu, &cmem_lat_pmu->associated_cpus)) + return 0; + + /* If the PMU is already managed, there is nothing to do */ + if (!cpumask_empty(&cmem_lat_pmu->active_cpu)) + return 0; + + /* Use this CPU for event counting */ + cpumask_set_cpu(cpu, &cmem_lat_pmu->active_cpu); + + return 0; +} + +static int cmem_lat_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + unsigned int dst; + + struct cmem_lat_pmu *cmem_lat_pmu = + hlist_entry_safe(node, struct cmem_lat_pmu, node); + + /* Nothing to do if this CPU doesn't own the PMU */ + if (!cpumask_test_and_clear_cpu(cpu, &cmem_lat_pmu->active_cpu)) + return 0; + + /* Choose a new CPU to migrate ownership of the PMU to */ + dst = cpumask_any_and_but(&cmem_lat_pmu->associated_cpus, + cpu_online_mask, cpu); + if (dst >= nr_cpu_ids) + return 0; + + /* Use this CPU for event counting */ + perf_pmu_migrate_context(&cmem_lat_pmu->pmu, cpu, dst); + cpumask_set_cpu(dst, &cmem_lat_pmu->active_cpu); + + return 0; +} + +static int cmem_lat_pmu_get_cpus(struct cmem_lat_pmu *cmem_lat_pmu, + unsigned int socket) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_node(cpu) == socket) + cpumask_set_cpu(cpu, &cmem_lat_pmu->associated_cpus); + } + + if (cpumask_empty(&cmem_lat_pmu->associated_cpus)) { + dev_dbg(cmem_lat_pmu->dev, + "No cpu associated with PMU socket-%u\n", socket); + return -ENODEV; + } + + return 0; +} + +static int cmem_lat_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct acpi_device *acpi_dev; + struct cmem_lat_pmu *cmem_lat_pmu; + char *name, *uid_str; 
+	int ret, i;
+	u32 socket;
+
+	acpi_dev = ACPI_COMPANION(dev);
+	if (!acpi_dev)
+		return -ENODEV;
+
+	/* The ACPI _UID string is parsed as the socket number of this PMU. */
+	uid_str = acpi_device_uid(acpi_dev);
+	if (!uid_str)
+		return -ENODEV;
+
+	ret = kstrtou32(uid_str, 0, &socket);
+	if (ret)
+		return ret;
+
+	/* Both allocations are device-managed; check them together. */
+	cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL);
+	name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket);
+	if (!cmem_lat_pmu || !name)
+		return -ENOMEM;
+
+	cmem_lat_pmu->dev = dev;
+	cmem_lat_pmu->name = name;
+	cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev);
+	platform_set_drvdata(pdev, cmem_lat_pmu);
+
+	cmem_lat_pmu->pmu = (struct pmu) {
+		/*
+		 * Let the perf core hold a reference on this module for every
+		 * open event, so the driver cannot be unloaded underneath them.
+		 */
+		.module		= THIS_MODULE,
+		.parent		= &pdev->dev,
+		.task_ctx_nr	= perf_invalid_context,
+		.pmu_enable	= cmem_lat_pmu_enable,
+		.pmu_disable	= cmem_lat_pmu_disable,
+		.event_init	= cmem_lat_pmu_event_init,
+		.add		= cmem_lat_pmu_add,
+		.del		= cmem_lat_pmu_del,
+		.start		= cmem_lat_pmu_start,
+		.stop		= cmem_lat_pmu_stop,
+		.read		= cmem_lat_pmu_read,
+		.attr_groups	= cmem_lat_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
+				  PERF_PMU_CAP_NO_INTERRUPT,
+	};
+
+	/* Map the address of all the instances. */
+	for (i = 0; i < NUM_INSTANCES; i++) {
+		cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i);
+		if (IS_ERR(cmem_lat_pmu->base[i])) {
+			dev_err(dev, "Failed map address for instance %d\n", i);
+			return PTR_ERR(cmem_lat_pmu->base[i]);
+		}
+	}
+
+	/* Map broadcast address. 
*/ + cmem_lat_pmu->base_broadcast = devm_platform_ioremap_resource(pdev, + NUM_INSTANCES); + if (IS_ERR(cmem_lat_pmu->base_broadcast)) { + dev_err(dev, "Failed map broadcast address\n"); + return PTR_ERR(cmem_lat_pmu->base_broadcast); + } + + ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug\n", ret); + return ret; + } + + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE); + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR); + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE); + + ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1); + if (ret) { + dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret); + cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); + return ret; + } + + dev_dbg(&pdev->dev, "Registered %s PMU\n", name); + + return 0; +} + +static void cmem_lat_pmu_device_remove(struct platform_device *pdev) +{ + struct cmem_lat_pmu *cmem_lat_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&cmem_lat_pmu->pmu); + cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); +} + +static const struct acpi_device_id cmem_lat_pmu_acpi_match[] = { + { "NVDA2021" }, + { } +}; +MODULE_DEVICE_TABLE(acpi, cmem_lat_pmu_acpi_match); + +static struct platform_driver cmem_lat_pmu_driver = { + .driver = { + .name = "nvidia-t410-cmem-latency-pmu", + .acpi_match_table = ACPI_PTR(cmem_lat_pmu_acpi_match), + .suppress_bind_attrs = true, + }, + .probe = cmem_lat_pmu_probe, + .remove = cmem_lat_pmu_device_remove, +}; + +static int __init cmem_lat_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/nvidia/cmem_latency:online", + cmem_lat_pmu_cpu_online, + cmem_lat_pmu_cpu_teardown); + if (ret < 0) + return ret; + + cmem_lat_pmu_cpuhp_state = ret; + + return 
platform_driver_register(&cmem_lat_pmu_driver); +} + +static void __exit cmem_lat_pmu_exit(void) +{ + platform_driver_unregister(&cmem_lat_pmu_driver); + cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state); +} + +module_init(cmem_lat_pmu_init); +module_exit(cmem_lat_pmu_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NVIDIA Tegra410 CPU Memory Latency PMU driver"); +MODULE_AUTHOR("Besar Wicaksono "); From 526e58071c4178841582ba72cc2a65f5740f5a57 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:51 +0000 Subject: [PATCH 15/17] perf: add NVIDIA Tegra410 C2C PMU Adds NVIDIA C2C PMU support in Tegra410 SOC. This PMU is used to measure memory latency between the SOC and device memory, e.g GPU Memory (GMEM), CXL Memory, or memory on remote Tegra410 SOC. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit 2f89b7f78c50ca973ca035ceb30426f78d9e0996) Signed-off-by: Matthew R. Ochs --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 151 +++ drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/nvidia_t410_c2c_pmu.c | 1051 +++++++++++++++++ 4 files changed, 1210 insertions(+) create mode 100644 drivers/perf/nvidia_t410_c2c_pmu.c diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 9945c43f6a7a5..0656223b61d47 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -9,6 +9,9 @@ metrics like memory bandwidth, latency, and utilization: * PCIE * PCIE-TGT * CPU Memory (CMEM) Latency +* NVLink-C2C +* NV-CLink +* NV-DLink PMU Driver ---------- @@ -369,3 +372,151 @@ see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_. 
Example usage:: perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}' + +NVLink-C2C PMU +-------------- + +This PMU monitors latency events of memory read/write requests that pass through +the NVIDIA Chip-to-Chip (C2C) interface. Bandwidth events are not available +in this PMU, unlike the C2C PMU in Grace (Tegra241 SoC). + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>. + +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. + * IN_RD_REQ: the number of incoming read requests. + * IN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests. + * IN_WR_REQ: the number of incoming write requests. + * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. + * OUT_RD_REQ: the number of outgoing read requests. + * OUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests. + * OUT_WR_REQ: the number of outgoing write requests. + * CYCLES: NVLink-C2C interface cycle counts. + +The incoming events count the reads/writes from remote device to the SoC. +The outgoing events count the reads/writes from the SoC to remote device. + +The sysfs /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>/peer +contains the information about the connected device. + +When the C2C interface is connected to GPU(s), the user can use the +"gpu_mask" parameter to filter traffic to/from specific GPU(s). Each bit represents the GPU +index, e.g. "gpu_mask=0x1" corresponds to GPU 0 and "gpu_mask=0x3" is for GPU 0 and 1. +The PMU will monitor all GPUs by default if not specified. + +When connected to another SoC, only the read events are available. 
+ +The events can be used to calculate the average latency of the read/write requests:: + + C2C_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + IN_WR_AVG_LATENCY_IN_CYCLES = IN_WR_CUM_OUTS / IN_WR_REQ + IN_WR_AVG_LATENCY_IN_NS = IN_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ + OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + OUT_WR_AVG_LATENCY_IN_CYCLES = OUT_WR_CUM_OUTS / OUT_WR_REQ + OUT_WR_AVG_LATENCY_IN_NS = OUT_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + +Example usage: + + * Count incoming traffic from all GPUs connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/ + + * Count incoming traffic from GPU 0 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/ + + * Count incoming traffic from GPU 1 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/ + + * Count outgoing traffic to all GPUs connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/ + + * Count outgoing traffic to GPU 0 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/ + + * Count outgoing traffic to GPU 1 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/ + +NV-CLink PMU +------------ + +This PMU monitors latency events of memory read requests that pass through +the NV-CLINK interface. Bandwidth events are not available in this PMU. +In Tegra410 SoC, the NV-CLink interface is used to connect to another Tegra410 +SoC and this PMU only counts read traffic. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvclink_pmu_<socket-id>. 
+ +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. + * IN_RD_REQ: the number of incoming read requests. + * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. + * OUT_RD_REQ: the number of outgoing read requests. + * CYCLES: NV-CLINK interface cycle counts. + +The incoming events count the reads from remote device to the SoC. +The outgoing events count the reads from the SoC to remote device. + +The events can be used to calculate the average latency of the read requests:: + + CLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ + + OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ + OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ + +Example usage: + + * Count incoming read traffic from remote SoC connected via NV-CLINK:: + + perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/ + + * Count outgoing read traffic to remote SoC connected via NV-CLINK:: + + perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/ + +NV-DLink PMU +------------ + +This PMU monitors latency events of memory read requests that pass through +the NV-DLINK interface. Bandwidth events are not available in this PMU. +In Tegra410 SoC, this PMU only counts CXL memory read traffic. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvdlink_pmu_<socket-id>. + +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory. + * IN_RD_REQ: the number of read requests to CXL memory. + * CYCLES: NV-DLINK interface cycle counts. 
+ +The events can be used to calculate the average latency of the read requests:: + + DLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / DLINK_FREQ_IN_GHZ + +Example usage: + + * Count read events to CXL memory:: + + perf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}' diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 26e86067d8f93..ab90932fc2d01 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -318,4 +318,11 @@ config NVIDIA_TEGRA410_CMEM_LATENCY_PMU Enable perf support for CPU memory latency counters monitoring on NVIDIA Tegra410 SoC. +config NVIDIA_TEGRA410_C2C_PMU + tristate "NVIDIA Tegra410 C2C PMU" + depends on ARM64 && ACPI + help + Enable perf support for counters in NVIDIA C2C interface of NVIDIA + Tegra410 SoC. + endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 4aa6aad393c2d..eb8a022dad9a7 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o +obj-$(CONFIG_NVIDIA_TEGRA410_C2C_PMU) += nvidia_t410_c2c_pmu.o diff --git a/drivers/perf/nvidia_t410_c2c_pmu.c b/drivers/perf/nvidia_t410_c2c_pmu.c new file mode 100644 index 0000000000000..411987153ff3f --- /dev/null +++ b/drivers/perf/nvidia_t410_c2c_pmu.c @@ -0,0 +1,1051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVIDIA Tegra410 C2C PMU driver. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The C2C interface types in Tegra410. 
*/ +#define C2C_TYPE_NVLINK 0x0 +#define C2C_TYPE_NVCLINK 0x1 +#define C2C_TYPE_NVDLINK 0x2 +#define C2C_TYPE_COUNT 0x3 + +/* The type of the peer device connected to the C2C interface. */ +#define C2C_PEER_TYPE_CPU 0x0 +#define C2C_PEER_TYPE_GPU 0x1 +#define C2C_PEER_TYPE_CXLMEM 0x2 +#define C2C_PEER_TYPE_COUNT 0x3 + +/* The number of peer devices can be connected to the C2C interface. */ +#define C2C_NR_PEER_CPU 0x1 +#define C2C_NR_PEER_GPU 0x2 +#define C2C_NR_PEER_CXLMEM 0x1 +#define C2C_NR_PEER_MAX 0x2 + +/* Number of instances on each interface. */ +#define C2C_NR_INST_NVLINK 14 +#define C2C_NR_INST_NVCLINK 12 +#define C2C_NR_INST_NVDLINK 16 +#define C2C_NR_INST_MAX 16 + +/* Register offsets. */ +#define C2C_CTRL 0x864 +#define C2C_IN_STATUS 0x868 +#define C2C_CYCLE_CNTR 0x86c +#define C2C_IN_RD_CUM_OUTS_CNTR 0x874 +#define C2C_IN_RD_REQ_CNTR 0x87c +#define C2C_IN_WR_CUM_OUTS_CNTR 0x884 +#define C2C_IN_WR_REQ_CNTR 0x88c +#define C2C_OUT_STATUS 0x890 +#define C2C_OUT_RD_CUM_OUTS_CNTR 0x898 +#define C2C_OUT_RD_REQ_CNTR 0x8a0 +#define C2C_OUT_WR_CUM_OUTS_CNTR 0x8a8 +#define C2C_OUT_WR_REQ_CNTR 0x8b0 + +/* C2C_IN_STATUS register field. */ +#define C2C_IN_STATUS_CYCLE_OVF BIT(0) +#define C2C_IN_STATUS_IN_RD_CUM_OUTS_OVF BIT(1) +#define C2C_IN_STATUS_IN_RD_REQ_OVF BIT(2) +#define C2C_IN_STATUS_IN_WR_CUM_OUTS_OVF BIT(3) +#define C2C_IN_STATUS_IN_WR_REQ_OVF BIT(4) + +/* C2C_OUT_STATUS register field. */ +#define C2C_OUT_STATUS_OUT_RD_CUM_OUTS_OVF BIT(0) +#define C2C_OUT_STATUS_OUT_RD_REQ_OVF BIT(1) +#define C2C_OUT_STATUS_OUT_WR_CUM_OUTS_OVF BIT(2) +#define C2C_OUT_STATUS_OUT_WR_REQ_OVF BIT(3) + +/* Events. 
*/ +#define C2C_EVENT_CYCLES 0x0 +#define C2C_EVENT_IN_RD_CUM_OUTS 0x1 +#define C2C_EVENT_IN_RD_REQ 0x2 +#define C2C_EVENT_IN_WR_CUM_OUTS 0x3 +#define C2C_EVENT_IN_WR_REQ 0x4 +#define C2C_EVENT_OUT_RD_CUM_OUTS 0x5 +#define C2C_EVENT_OUT_RD_REQ 0x6 +#define C2C_EVENT_OUT_WR_CUM_OUTS 0x7 +#define C2C_EVENT_OUT_WR_REQ 0x8 + +#define C2C_NUM_EVENTS 0x9 +#define C2C_MASK_EVENT 0xFF +#define C2C_MAX_ACTIVE_EVENTS 32 + +#define C2C_ACTIVE_CPU_MASK 0x0 +#define C2C_ASSOCIATED_CPU_MASK 0x1 + +/* + * Maximum poll count for reading counter value using high-low-high sequence. + */ +#define HILOHI_MAX_POLL 1000 + +static unsigned long nv_c2c_pmu_cpuhp_state; + +/* PMU descriptor. */ + +/* C2C type information. */ +struct nv_c2c_pmu_data { + unsigned int c2c_type; + unsigned int nr_inst; + const char *name_fmt; +}; + +static const struct nv_c2c_pmu_data nv_c2c_pmu_data[] = { + [C2C_TYPE_NVLINK] = { + .c2c_type = C2C_TYPE_NVLINK, + .nr_inst = C2C_NR_INST_NVLINK, + .name_fmt = "nvidia_nvlink_c2c_pmu_%u", + }, + [C2C_TYPE_NVCLINK] = { + .c2c_type = C2C_TYPE_NVCLINK, + .nr_inst = C2C_NR_INST_NVCLINK, + .name_fmt = "nvidia_nvclink_pmu_%u", + }, + [C2C_TYPE_NVDLINK] = { + .c2c_type = C2C_TYPE_NVDLINK, + .nr_inst = C2C_NR_INST_NVDLINK, + .name_fmt = "nvidia_nvdlink_pmu_%u", + }, +}; + +/* Tracks the events assigned to the PMU for a given logical index. */ +struct nv_c2c_pmu_hw_events { + /* The events that are active. */ + struct perf_event *events[C2C_MAX_ACTIVE_EVENTS]; + + /* + * Each bit indicates a logical counter is being used (or not) for an + * event. 
+ */ + DECLARE_BITMAP(used_ctrs, C2C_MAX_ACTIVE_EVENTS); +}; + +struct nv_c2c_pmu { + struct pmu pmu; + struct device *dev; + struct acpi_device *acpi_dev; + + const char *name; + const char *identifier; + + const struct nv_c2c_pmu_data *data; + unsigned int peer_type; + unsigned int socket; + unsigned int nr_peer; + unsigned long peer_insts[C2C_NR_PEER_MAX][BITS_TO_LONGS(C2C_NR_INST_MAX)]; + u32 filter_default; + + struct nv_c2c_pmu_hw_events hw_events; + + cpumask_t associated_cpus; + cpumask_t active_cpu; + + struct hlist_node cpuhp_node; + + const struct attribute_group **attr_groups; + + void __iomem *base_broadcast; + void __iomem *base[C2C_NR_INST_MAX]; +}; + +#define to_c2c_pmu(p) (container_of(p, struct nv_c2c_pmu, pmu)) + +/* Get event type from perf_event. */ +static inline u32 get_event_type(struct perf_event *event) +{ + return (event->attr.config) & C2C_MASK_EVENT; +} + +static inline u32 get_filter_mask(struct perf_event *event) +{ + u32 filter; + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + + filter = ((u32)event->attr.config1) & c2c_pmu->filter_default; + if (filter == 0) + filter = c2c_pmu->filter_default; + + return filter; +} + +/* PMU operations. */ + +static int nv_c2c_pmu_get_event_idx(struct nv_c2c_pmu_hw_events *hw_events, + struct perf_event *event) +{ + u32 idx; + + idx = find_first_zero_bit(hw_events->used_ctrs, C2C_MAX_ACTIVE_EVENTS); + if (idx >= C2C_MAX_ACTIVE_EVENTS) + return -EAGAIN; + + set_bit(idx, hw_events->used_ctrs); + + return idx; +} + +static bool +nv_c2c_pmu_validate_event(struct pmu *pmu, + struct nv_c2c_pmu_hw_events *hw_events, + struct perf_event *event) +{ + if (is_software_event(event)) + return true; + + /* Reject groups spanning multiple HW PMUs. */ + if (event->pmu != pmu) + return false; + + return nv_c2c_pmu_get_event_idx(hw_events, event) >= 0; +} + +/* + * Make sure the group of events can be scheduled at once + * on the PMU. 
+ */ +static bool nv_c2c_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct nv_c2c_pmu_hw_events fake_hw_events; + + if (event->group_leader == event) + return true; + + memset(&fake_hw_events, 0, sizeof(fake_hw_events)); + + if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, leader)) + return false; + + for_each_sibling_event(sibling, leader) { + if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, + sibling)) + return false; + } + + return nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, event); +} + +static int nv_c2c_pmu_event_init(struct perf_event *event) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u32 event_type = get_event_type(event); + + if (event->attr.type != event->pmu->type || + event_type >= C2C_NUM_EVENTS) + return -ENOENT; + + /* + * Following other "uncore" PMUs, we do not support sampling mode or + * attach to a task (per-process mode). + */ + if (is_sampling_event(event)) { + dev_dbg(c2c_pmu->pmu.dev, "Can't support sampling events\n"); + return -EOPNOTSUPP; + } + + if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(c2c_pmu->pmu.dev, "Can't support per-task counters\n"); + return -EINVAL; + } + + /* + * Make sure the CPU assignment is on one of the CPUs associated with + * this PMU. + */ + if (!cpumask_test_cpu(event->cpu, &c2c_pmu->associated_cpus)) { + dev_dbg(c2c_pmu->pmu.dev, + "Requested cpu is not associated with the PMU\n"); + return -EINVAL; + } + + /* Enforce the current active CPU to handle the events in this PMU. */ + event->cpu = cpumask_first(&c2c_pmu->active_cpu); + if (event->cpu >= nr_cpu_ids) + return -EINVAL; + + if (!nv_c2c_pmu_validate_group(event)) + return -EINVAL; + + hwc->idx = -1; + hwc->config = event_type; + + return 0; +} + +/* + * Read 64-bit register as a pair of 32-bit registers using hi-lo-hi sequence. 
+ */ +static u64 read_reg64_hilohi(const void __iomem *addr, u32 max_poll_count) +{ + u32 val_lo, val_hi; + u64 val; + + /* Use high-low-high sequence to avoid tearing */ + do { + if (max_poll_count-- == 0) { + pr_err("NV C2C PMU: timeout hi-low-high sequence\n"); + return 0; + } + + val_hi = readl(addr + 4); + val_lo = readl(addr); + } while (val_hi != readl(addr + 4)); + + val = (((u64)val_hi << 32) | val_lo); + + return val; +} + +static void nv_c2c_pmu_check_status(struct nv_c2c_pmu *c2c_pmu, u32 instance) +{ + u32 in_status, out_status; + + in_status = readl(c2c_pmu->base[instance] + C2C_IN_STATUS); + out_status = readl(c2c_pmu->base[instance] + C2C_OUT_STATUS); + + if (in_status || out_status) + dev_warn(c2c_pmu->dev, + "C2C PMU overflow in: 0x%x, out: 0x%x\n", + in_status, out_status); +} + +static u32 nv_c2c_ctr_offset[C2C_NUM_EVENTS] = { + [C2C_EVENT_CYCLES] = C2C_CYCLE_CNTR, + [C2C_EVENT_IN_RD_CUM_OUTS] = C2C_IN_RD_CUM_OUTS_CNTR, + [C2C_EVENT_IN_RD_REQ] = C2C_IN_RD_REQ_CNTR, + [C2C_EVENT_IN_WR_CUM_OUTS] = C2C_IN_WR_CUM_OUTS_CNTR, + [C2C_EVENT_IN_WR_REQ] = C2C_IN_WR_REQ_CNTR, + [C2C_EVENT_OUT_RD_CUM_OUTS] = C2C_OUT_RD_CUM_OUTS_CNTR, + [C2C_EVENT_OUT_RD_REQ] = C2C_OUT_RD_REQ_CNTR, + [C2C_EVENT_OUT_WR_CUM_OUTS] = C2C_OUT_WR_CUM_OUTS_CNTR, + [C2C_EVENT_OUT_WR_REQ] = C2C_OUT_WR_REQ_CNTR, +}; + +static u64 nv_c2c_pmu_read_counter(struct perf_event *event) +{ + u32 ctr_id, ctr_offset, filter_mask, filter_idx, inst_idx; + unsigned long *inst_mask; + DECLARE_BITMAP(filter_bitmap, C2C_NR_PEER_MAX); + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + u64 val = 0; + + filter_mask = get_filter_mask(event); + bitmap_from_arr32(filter_bitmap, &filter_mask, c2c_pmu->nr_peer); + + ctr_id = event->hw.config; + ctr_offset = nv_c2c_ctr_offset[ctr_id]; + + for_each_set_bit(filter_idx, filter_bitmap, c2c_pmu->nr_peer) { + inst_mask = c2c_pmu->peer_insts[filter_idx]; + for_each_set_bit(inst_idx, inst_mask, c2c_pmu->data->nr_inst) { + nv_c2c_pmu_check_status(c2c_pmu, 
inst_idx); + + /* + * Each instance share same clock and the driver always + * enables all instances. So we can use the counts from + * one instance for cycle counter. + */ + if (ctr_id == C2C_EVENT_CYCLES) + return read_reg64_hilohi( + c2c_pmu->base[inst_idx] + ctr_offset, + HILOHI_MAX_POLL); + + /* + * For other events, sum up the counts from all instances. + */ + val += read_reg64_hilohi( + c2c_pmu->base[inst_idx] + ctr_offset, + HILOHI_MAX_POLL); + } + } + + return val; +} + +static void nv_c2c_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, now; + + do { + prev = local64_read(&hwc->prev_count); + now = nv_c2c_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); + + local64_add(now - prev, &event->count); +} + +static void nv_c2c_pmu_start(struct perf_event *event, int pmu_flags) +{ + event->hw.state = 0; +} + +static void nv_c2c_pmu_stop(struct perf_event *event, int pmu_flags) +{ + event->hw.state |= PERF_HES_STOPPED; +} + +static int nv_c2c_pmu_add(struct perf_event *event, int flags) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx; + + if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &c2c_pmu->associated_cpus))) + return -ENOENT; + + idx = nv_c2c_pmu_get_event_idx(hw_events, event); + if (idx < 0) + return idx; + + hw_events->events[idx] = event; + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + if (flags & PERF_EF_START) + nv_c2c_pmu_start(event, PERF_EF_RELOAD); + + /* Propagate changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); + + return 0; +} + +static void nv_c2c_pmu_del(struct perf_event *event, int flags) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + nv_c2c_pmu_stop(event, PERF_EF_UPDATE); + + hw_events->events[idx] = NULL; + + clear_bit(idx, hw_events->used_ctrs); + + perf_event_update_userpage(event); +} + +static void nv_c2c_pmu_read(struct perf_event *event) +{ + nv_c2c_pmu_event_update(event); +} + +static void nv_c2c_pmu_enable(struct pmu *pmu) +{ + void __iomem *bcast; + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); + + /* Check if any filter is enabled. */ + if (bitmap_empty(c2c_pmu->hw_events.used_ctrs, C2C_MAX_ACTIVE_EVENTS)) + return; + + /* Enable all the counters. */ + bcast = c2c_pmu->base_broadcast; + writel(0x1UL, bcast + C2C_CTRL); +} + +static void nv_c2c_pmu_disable(struct pmu *pmu) +{ + unsigned int idx; + void __iomem *bcast; + struct perf_event *event; + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); + + /* Disable all the counters. */ + bcast = c2c_pmu->base_broadcast; + writel(0x0UL, bcast + C2C_CTRL); + + /* + * The counters will start from 0 again on restart. + * Update the events immediately to avoid losing the counts. + */ + for_each_set_bit(idx, c2c_pmu->hw_events.used_ctrs, + C2C_MAX_ACTIVE_EVENTS) { + event = c2c_pmu->hw_events.events[idx]; + + if (!event) + continue; + + nv_c2c_pmu_event_update(event); + + local64_set(&event->hw.prev_count, 0ULL); + } +} + +/* PMU identifier attribute. 
*/ + +static ssize_t nv_c2c_pmu_identifier_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(page, "%s\n", c2c_pmu->identifier); +} + +static struct device_attribute nv_c2c_pmu_identifier_attr = + __ATTR(identifier, 0444, nv_c2c_pmu_identifier_show, NULL); + +static struct attribute *nv_c2c_pmu_identifier_attrs[] = { + &nv_c2c_pmu_identifier_attr.attr, + NULL, +}; + +static struct attribute_group nv_c2c_pmu_identifier_attr_group = { + .attrs = nv_c2c_pmu_identifier_attrs, +}; + +/* Peer attribute. */ + +static ssize_t nv_c2c_pmu_peer_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + const char *peer_type[C2C_PEER_TYPE_COUNT] = { + [C2C_PEER_TYPE_CPU] = "cpu", + [C2C_PEER_TYPE_GPU] = "gpu", + [C2C_PEER_TYPE_CXLMEM] = "cxlmem", + }; + + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev)); + return sysfs_emit(page, "nr_%s=%u\n", peer_type[c2c_pmu->peer_type], + c2c_pmu->nr_peer); +} + +static struct device_attribute nv_c2c_pmu_peer_attr = + __ATTR(peer, 0444, nv_c2c_pmu_peer_show, NULL); + +static struct attribute *nv_c2c_pmu_peer_attrs[] = { + &nv_c2c_pmu_peer_attr.attr, + NULL, +}; + +static struct attribute_group nv_c2c_pmu_peer_attr_group = { + .attrs = nv_c2c_pmu_peer_attrs, +}; + +/* Format attributes. 
*/ + +#define NV_C2C_PMU_EXT_ATTR(_name, _func, _config) \ + (&((struct dev_ext_attribute[]){ \ + { \ + .attr = __ATTR(_name, 0444, _func, NULL), \ + .var = (void *)_config \ + } \ + })[0].attr.attr) + +#define NV_C2C_PMU_FORMAT_ATTR(_name, _config) \ + NV_C2C_PMU_EXT_ATTR(_name, device_show_string, _config) + +#define NV_C2C_PMU_FORMAT_EVENT_ATTR \ + NV_C2C_PMU_FORMAT_ATTR(event, "config:0-3") + +static struct attribute *nv_c2c_pmu_gpu_formats[] = { + NV_C2C_PMU_FORMAT_EVENT_ATTR, + NV_C2C_PMU_FORMAT_ATTR(gpu_mask, "config1:0-1"), + NULL, +}; + +static const struct attribute_group nv_c2c_pmu_gpu_format_group = { + .name = "format", + .attrs = nv_c2c_pmu_gpu_formats, +}; + +static struct attribute *nv_c2c_pmu_formats[] = { + NV_C2C_PMU_FORMAT_EVENT_ATTR, + NULL, +}; + +static const struct attribute_group nv_c2c_pmu_format_group = { + .name = "format", + .attrs = nv_c2c_pmu_formats, +}; + +/* Event attributes. */ + +static ssize_t nv_c2c_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, typeof(*pmu_attr), attr); + return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); +} + +#define NV_C2C_PMU_EVENT_ATTR(_name, _config) \ + PMU_EVENT_ATTR_ID(_name, nv_c2c_pmu_sysfs_event_show, _config) + +static struct attribute *nv_c2c_pmu_gpu_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(in_wr_cum_outs, C2C_EVENT_IN_WR_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_wr_req, C2C_EVENT_IN_WR_REQ), + NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(out_wr_cum_outs, C2C_EVENT_OUT_WR_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_wr_req, C2C_EVENT_OUT_WR_REQ), + NULL +}; + +static const struct attribute_group 
nv_c2c_pmu_gpu_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_gpu_events, +}; + +static struct attribute *nv_c2c_pmu_cpu_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ), + NULL +}; + +static const struct attribute_group nv_c2c_pmu_cpu_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_cpu_events, +}; + +static struct attribute *nv_c2c_pmu_cxlmem_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NULL +}; + +static const struct attribute_group nv_c2c_pmu_cxlmem_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_cxlmem_events, +}; + +/* Cpumask attributes. */ + +static ssize_t nv_c2c_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); + struct dev_ext_attribute *eattr = + container_of(attr, struct dev_ext_attribute, attr); + unsigned long mask_id = (unsigned long)eattr->var; + const cpumask_t *cpumask; + + switch (mask_id) { + case C2C_ACTIVE_CPU_MASK: + cpumask = &c2c_pmu->active_cpu; + break; + case C2C_ASSOCIATED_CPU_MASK: + cpumask = &c2c_pmu->associated_cpus; + break; + default: + return 0; + } + return cpumap_print_to_pagebuf(true, buf, cpumask); +} + +#define NV_C2C_PMU_CPUMASK_ATTR(_name, _config) \ + NV_C2C_PMU_EXT_ATTR(_name, nv_c2c_pmu_cpumask_show, \ + (unsigned long)_config) + +static struct attribute *nv_c2c_pmu_cpumask_attrs[] = { + NV_C2C_PMU_CPUMASK_ATTR(cpumask, C2C_ACTIVE_CPU_MASK), + NV_C2C_PMU_CPUMASK_ATTR(associated_cpus, C2C_ASSOCIATED_CPU_MASK), + NULL, +}; + +static const struct attribute_group 
nv_c2c_pmu_cpumask_attr_group = { + .attrs = nv_c2c_pmu_cpumask_attrs, +}; + +/* Attribute groups for C2C PMU connecting SoC and GPU */ +static const struct attribute_group *nv_c2c_pmu_gpu_attr_groups[] = { + &nv_c2c_pmu_gpu_format_group, + &nv_c2c_pmu_gpu_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +/* Attribute groups for C2C PMU connecting multiple SoCs */ +static const struct attribute_group *nv_c2c_pmu_cpu_attr_groups[] = { + &nv_c2c_pmu_format_group, + &nv_c2c_pmu_cpu_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +/* Attribute groups for C2C PMU connecting SoC and CXLMEM */ +static const struct attribute_group *nv_c2c_pmu_cxlmem_attr_groups[] = { + &nv_c2c_pmu_format_group, + &nv_c2c_pmu_cxlmem_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +static int nv_c2c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct nv_c2c_pmu *c2c_pmu = + hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); + + if (!cpumask_test_cpu(cpu, &c2c_pmu->associated_cpus)) + return 0; + + /* If the PMU is already managed, there is nothing to do */ + if (!cpumask_empty(&c2c_pmu->active_cpu)) + return 0; + + /* Use this CPU for event counting */ + cpumask_set_cpu(cpu, &c2c_pmu->active_cpu); + + return 0; +} + +static int nv_c2c_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + unsigned int dst; + + struct nv_c2c_pmu *c2c_pmu = + hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); + + /* Nothing to do if this CPU doesn't own the PMU */ + if (!cpumask_test_and_clear_cpu(cpu, &c2c_pmu->active_cpu)) + return 0; + + /* Choose a new CPU to migrate ownership of the PMU to */ + dst = cpumask_any_and_but(&c2c_pmu->associated_cpus, + cpu_online_mask, cpu); + if (dst >= nr_cpu_ids) + return 0; + + /* Use this CPU for 
event counting */ + perf_pmu_migrate_context(&c2c_pmu->pmu, cpu, dst); + cpumask_set_cpu(dst, &c2c_pmu->active_cpu); + + return 0; +} + +static int nv_c2c_pmu_get_cpus(struct nv_c2c_pmu *c2c_pmu) +{ + int socket = c2c_pmu->socket, cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_node(cpu) == socket) + cpumask_set_cpu(cpu, &c2c_pmu->associated_cpus); + } + + if (cpumask_empty(&c2c_pmu->associated_cpus)) { + dev_dbg(c2c_pmu->dev, + "No cpu associated with C2C PMU socket-%u\n", socket); + return -ENODEV; + } + + return 0; +} + +static int nv_c2c_pmu_init_socket(struct nv_c2c_pmu *c2c_pmu) +{ + const char *uid_str; + int ret, socket; + + uid_str = acpi_device_uid(c2c_pmu->acpi_dev); + if (!uid_str) { + dev_err(c2c_pmu->dev, "No ACPI device UID\n"); + return -ENODEV; + } + + ret = kstrtou32(uid_str, 0, &socket); + if (ret) { + dev_err(c2c_pmu->dev, "Failed to parse ACPI device UID\n"); + return ret; + } + + c2c_pmu->socket = socket; + return 0; +} + +static int nv_c2c_pmu_init_id(struct nv_c2c_pmu *c2c_pmu) +{ + char *name; + + name = devm_kasprintf(c2c_pmu->dev, GFP_KERNEL, c2c_pmu->data->name_fmt, + c2c_pmu->socket); + if (!name) + return -ENOMEM; + + c2c_pmu->name = name; + + c2c_pmu->identifier = acpi_device_hid(c2c_pmu->acpi_dev); + + return 0; +} + +static int nv_c2c_pmu_init_filter(struct nv_c2c_pmu *c2c_pmu) +{ + u32 cpu_en = 0; + struct device *dev = c2c_pmu->dev; + const struct nv_c2c_pmu_data *data = c2c_pmu->data; + + if (data->c2c_type == C2C_TYPE_NVDLINK) { + c2c_pmu->peer_type = C2C_PEER_TYPE_CXLMEM; + + c2c_pmu->peer_insts[0][0] = (1UL << data->nr_inst) - 1; + + c2c_pmu->nr_peer = C2C_NR_PEER_CXLMEM; + c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; + + c2c_pmu->attr_groups = nv_c2c_pmu_cxlmem_attr_groups; + + return 0; + } + + if (device_property_read_u32(dev, "cpu_en_mask", &cpu_en)) + dev_dbg(dev, "no cpu_en_mask property\n"); + + if (cpu_en) { + c2c_pmu->peer_type = C2C_PEER_TYPE_CPU; + + /* Fill peer_insts bitmap with instances connected to 
peer CPU. */ + bitmap_from_arr32(c2c_pmu->peer_insts[0], &cpu_en, data->nr_inst); + + c2c_pmu->nr_peer = 1; + c2c_pmu->attr_groups = nv_c2c_pmu_cpu_attr_groups; + } else { + u32 i; + const char *props[C2C_NR_PEER_MAX] = { + "gpu0_en_mask", "gpu1_en_mask" + }; + + for (i = 0; i < C2C_NR_PEER_MAX; i++) { + u32 gpu_en = 0; + + if (device_property_read_u32(dev, props[i], &gpu_en)) + dev_dbg(dev, "no %s property\n", props[i]); + + if (gpu_en) { + /* Fill peer_insts bitmap with instances connected to peer GPU. */ + bitmap_from_arr32(c2c_pmu->peer_insts[i], &gpu_en, + data->nr_inst); + + c2c_pmu->nr_peer++; + } + } + + if (c2c_pmu->nr_peer == 0) { + dev_err(dev, "No GPU is enabled\n"); + return -EINVAL; + } + + c2c_pmu->peer_type = C2C_PEER_TYPE_GPU; + c2c_pmu->attr_groups = nv_c2c_pmu_gpu_attr_groups; + } + + c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; + + return 0; +} + +static void *nv_c2c_pmu_init_pmu(struct platform_device *pdev) +{ + int ret; + struct nv_c2c_pmu *c2c_pmu; + struct acpi_device *acpi_dev; + struct device *dev = &pdev->dev; + + acpi_dev = ACPI_COMPANION(dev); + if (!acpi_dev) + return ERR_PTR(-ENODEV); + + c2c_pmu = devm_kzalloc(dev, sizeof(*c2c_pmu), GFP_KERNEL); + if (!c2c_pmu) + return ERR_PTR(-ENOMEM); + + c2c_pmu->dev = dev; + c2c_pmu->acpi_dev = acpi_dev; + c2c_pmu->data = (const struct nv_c2c_pmu_data *)device_get_match_data(dev); + if (!c2c_pmu->data) + return ERR_PTR(-EINVAL); + + platform_set_drvdata(pdev, c2c_pmu); + + ret = nv_c2c_pmu_init_socket(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + ret = nv_c2c_pmu_init_id(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + ret = nv_c2c_pmu_init_filter(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + return c2c_pmu; +} + +static int nv_c2c_pmu_init_mmio(struct nv_c2c_pmu *c2c_pmu) +{ + int i; + struct device *dev = c2c_pmu->dev; + struct platform_device *pdev = to_platform_device(dev); + const struct nv_c2c_pmu_data *data = c2c_pmu->data; + + /* Map the address of all the instances. 
*/ + for (i = 0; i < data->nr_inst; i++) { + c2c_pmu->base[i] = devm_platform_ioremap_resource(pdev, i); + if (IS_ERR(c2c_pmu->base[i])) { + dev_err(dev, "Failed map address for instance %d\n", i); + return PTR_ERR(c2c_pmu->base[i]); + } + } + + /* Map broadcast address. */ + c2c_pmu->base_broadcast = devm_platform_ioremap_resource(pdev, + data->nr_inst); + if (IS_ERR(c2c_pmu->base_broadcast)) { + dev_err(dev, "Failed map broadcast address\n"); + return PTR_ERR(c2c_pmu->base_broadcast); + } + + return 0; +} + +static int nv_c2c_pmu_register_pmu(struct nv_c2c_pmu *c2c_pmu) +{ + int ret; + + ret = cpuhp_state_add_instance(nv_c2c_pmu_cpuhp_state, + &c2c_pmu->cpuhp_node); + if (ret) { + dev_err(c2c_pmu->dev, "Error %d registering hotplug\n", ret); + return ret; + } + + c2c_pmu->pmu = (struct pmu) { + .parent = c2c_pmu->dev, + .task_ctx_nr = perf_invalid_context, + .pmu_enable = nv_c2c_pmu_enable, + .pmu_disable = nv_c2c_pmu_disable, + .event_init = nv_c2c_pmu_event_init, + .add = nv_c2c_pmu_add, + .del = nv_c2c_pmu_del, + .start = nv_c2c_pmu_start, + .stop = nv_c2c_pmu_stop, + .read = nv_c2c_pmu_read, + .attr_groups = c2c_pmu->attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | + PERF_PMU_CAP_NO_INTERRUPT, + }; + + ret = perf_pmu_register(&c2c_pmu->pmu, c2c_pmu->name, -1); + if (ret) { + dev_err(c2c_pmu->dev, "Failed to register C2C PMU: %d\n", ret); + cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, + &c2c_pmu->cpuhp_node); + return ret; + } + + return 0; +} + +static int nv_c2c_pmu_probe(struct platform_device *pdev) +{ + int ret; + struct nv_c2c_pmu *c2c_pmu; + + c2c_pmu = nv_c2c_pmu_init_pmu(pdev); + if (IS_ERR(c2c_pmu)) + return PTR_ERR(c2c_pmu); + + ret = nv_c2c_pmu_init_mmio(c2c_pmu); + if (ret) + return ret; + + ret = nv_c2c_pmu_get_cpus(c2c_pmu); + if (ret) + return ret; + + ret = nv_c2c_pmu_register_pmu(c2c_pmu); + if (ret) + return ret; + + dev_dbg(c2c_pmu->dev, "Registered %s PMU\n", c2c_pmu->name); + + return 0; +} + +static void 
nv_c2c_pmu_device_remove(struct platform_device *pdev) +{ + struct nv_c2c_pmu *c2c_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&c2c_pmu->pmu); + cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, &c2c_pmu->cpuhp_node); +} + +static const struct acpi_device_id nv_c2c_pmu_acpi_match[] = { + { "NVDA2023", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVLINK] }, + { "NVDA2022", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVCLINK] }, + { "NVDA2020", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVDLINK] }, + { } +}; +MODULE_DEVICE_TABLE(acpi, nv_c2c_pmu_acpi_match); + +static struct platform_driver nv_c2c_pmu_driver = { + .driver = { + .name = "nvidia-t410-c2c-pmu", + .acpi_match_table = nv_c2c_pmu_acpi_match, + .suppress_bind_attrs = true, + }, + .probe = nv_c2c_pmu_probe, + .remove = nv_c2c_pmu_device_remove, +}; + +static int __init nv_c2c_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/nvidia/c2c:online", + nv_c2c_pmu_online_cpu, + nv_c2c_pmu_cpu_teardown); + if (ret < 0) + return ret; + + nv_c2c_pmu_cpuhp_state = ret; + return platform_driver_register(&nv_c2c_pmu_driver); +} + +static void __exit nv_c2c_pmu_exit(void) +{ + platform_driver_unregister(&nv_c2c_pmu_driver); + cpuhp_remove_multi_state(nv_c2c_pmu_cpuhp_state); +} + +module_init(nv_c2c_pmu_init); +module_exit(nv_c2c_pmu_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NVIDIA Tegra410 C2C PMU driver"); +MODULE_AUTHOR("Besar Wicaksono "); From 5e154c93f76cd3873027d8954f96d3f1d65ac373 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Thu, 12 Feb 2026 23:34:07 +0000 Subject: [PATCH 16/17] perf vendor events arm64: Add Tegra410 Olympus PMU events Add JSON files for NVIDIA Tegra410 Olympus core PMU events. Also updated the common-and-microarch.json. Signed-off-by: Besar Wicaksono Reviewed-by: James Clark Signed-off-by: Namhyung Kim (cherry picked from commit 86ff690f45cc034ab32246630b3c7d7a46d1ae6b) Signed-off-by: Matthew R. 
Ochs --- .../arch/arm64/common-and-microarch.json | 85 +++ tools/perf/pmu-events/arch/arm64/mapfile.csv | 1 + .../arch/arm64/nvidia/t410/branch.json | 45 ++ .../arch/arm64/nvidia/t410/brbe.json | 6 + .../arch/arm64/nvidia/t410/bus.json | 48 ++ .../arch/arm64/nvidia/t410/exception.json | 62 ++ .../arch/arm64/nvidia/t410/fp_operation.json | 78 ++ .../arch/arm64/nvidia/t410/general.json | 15 + .../arch/arm64/nvidia/t410/l1d_cache.json | 122 +++ .../arch/arm64/nvidia/t410/l1i_cache.json | 114 +++ .../arch/arm64/nvidia/t410/l2d_cache.json | 134 ++++ .../arch/arm64/nvidia/t410/ll_cache.json | 107 +++ .../arch/arm64/nvidia/t410/memory.json | 46 ++ .../arch/arm64/nvidia/t410/metrics.json | 722 ++++++++++++++++++ .../arch/arm64/nvidia/t410/misc.json | 642 ++++++++++++++++ .../arch/arm64/nvidia/t410/retired.json | 94 +++ .../arch/arm64/nvidia/t410/spe.json | 42 + .../arm64/nvidia/t410/spec_operation.json | 230 ++++++ .../arch/arm64/nvidia/t410/stall.json | 145 ++++ .../arch/arm64/nvidia/t410/tlb.json | 158 ++++ 20 files changed, 2896 insertions(+) create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json create mode 
100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json diff --git a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json index 468cb085d8796..144325d87be44 100644 --- a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json +++ b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json @@ -1512,11 +1512,26 @@ "EventName": "L2D_CACHE_REFILL_PRFM", "BriefDescription": "Level 2 data cache refill, software preload" }, + { + "EventCode": "0x8150", + "EventName": "L3D_CACHE_RW", + "BriefDescription": "Level 3 data cache demand access." + }, + { + "EventCode": "0x8151", + "EventName": "L3D_CACHE_PRFM", + "BriefDescription": "Level 3 data cache software prefetch" + }, { "EventCode": "0x8152", "EventName": "L3D_CACHE_MISS", "BriefDescription": "Level 3 data cache demand access miss" }, + { + "EventCode": "0x8153", + "EventName": "L3D_CACHE_REFILL_PRFM", + "BriefDescription": "Level 3 data cache refill, software prefetch." + }, { "EventCode": "0x8154", "EventName": "L1D_CACHE_HWPRF", @@ -1527,6 +1542,11 @@ "EventName": "L2D_CACHE_HWPRF", "BriefDescription": "Level 2 data cache hardware prefetch." }, + { + "EventCode": "0x8156", + "EventName": "L3D_CACHE_HWPRF", + "BriefDescription": "Level 3 data cache hardware prefetch." + }, { "EventCode": "0x8158", "EventName": "STALL_FRONTEND_MEMBOUND", @@ -1682,6 +1702,11 @@ "EventName": "L2D_CACHE_REFILL_HWPRF", "BriefDescription": "Level 2 data cache refill, hardware prefetch." 
}, + { + "EventCode": "0x81BE", + "EventName": "L3D_CACHE_REFILL_HWPRF", + "BriefDescription": "Level 3 data cache refill, hardware prefetch." + }, { "EventCode": "0x81C0", "EventName": "L1I_CACHE_HIT_RD", @@ -1712,11 +1737,31 @@ "EventName": "L1I_CACHE_HIT_RD_FPRFM", "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by software preload" }, + { + "EventCode": "0x81DC", + "EventName": "L1D_CACHE_HIT_RW_FPRFM", + "BriefDescription": "Level 1 data cache demand access first hit, fetched by software prefetch." + }, { "EventCode": "0x81E0", "EventName": "L1I_CACHE_HIT_RD_FHWPRF", "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by hardware prefetcher" }, + { + "EventCode": "0x81EC", + "EventName": "L1D_CACHE_HIT_RW_FHWPRF", + "BriefDescription": "Level 1 data cache demand access first hit, fetched by hardware prefetcher." + }, + { + "EventCode": "0x81F0", + "EventName": "L1I_CACHE_HIT_RD_FPRF", + "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by prefetch." + }, + { + "EventCode": "0x81FC", + "EventName": "L1D_CACHE_HIT_RW_FPRF", + "BriefDescription": "Level 1 data cache demand access first hit, fetched by prefetch." + }, { "EventCode": "0x8200", "EventName": "L1I_CACHE_HIT", @@ -1767,11 +1812,26 @@ "EventName": "L1I_LFB_HIT_RD_FPRFM", "BriefDescription": "Level 1 instruction cache demand fetch line-fill buffer first hit, recently fetched by software preload" }, + { + "EventCode": "0x825C", + "EventName": "L1D_LFB_HIT_RW_FPRFM", + "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by software prefetch." 
+ }, { "EventCode": "0x8260", "EventName": "L1I_LFB_HIT_RD_FHWPRF", "BriefDescription": "Level 1 instruction cache demand fetch line-fill buffer first hit, recently fetched by hardware prefetcher" }, + { + "EventCode": "0x826C", + "EventName": "L1D_LFB_HIT_RW_FHWPRF", + "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by hardware prefetcher." + }, + { + "EventCode": "0x827C", + "EventName": "L1D_LFB_HIT_RW_FPRF", + "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by prefetch." + }, { "EventCode": "0x8280", "EventName": "L1I_CACHE_PRF", @@ -1807,6 +1867,11 @@ "EventName": "LL_CACHE_REFILL", "BriefDescription": "Last level cache refill" }, + { + "EventCode": "0x828E", + "EventName": "L3D_CACHE_REFILL_PRF", + "BriefDescription": "Level 3 data cache refill, prefetch." + }, { "EventCode": "0x8320", "EventName": "L1D_CACHE_REFILL_PERCYC", @@ -1872,6 +1937,16 @@ "EventName": "FP_FP8_MIN_SPEC", "BriefDescription": "Floating-point operation speculatively_executed, smallest type is 8-bit floating-point." }, + { + "EventCode": "0x8480", + "EventName": "FP_SP_FIXED_MIN_OPS_SPEC", + "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is single-precision floating-point." + }, + { + "EventCode": "0x8482", + "EventName": "FP_HP_FIXED_MIN_OPS_SPEC", + "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is half-precision floating-point." + }, { "EventCode": "0x8483", "EventName": "FP_BF16_FIXED_MIN_OPS_SPEC", @@ -1882,6 +1957,16 @@ "EventName": "FP_FP8_FIXED_MIN_OPS_SPEC", "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is 8-bit floating-point." 
}, + { + "EventCode": "0x8488", + "EventName": "FP_SP_SCALE_MIN_OPS_SPEC", + "BriefDescription": "Scalable element arithmetic operations speculatively executed, smallest type is single-precision floating-point." + }, + { + "EventCode": "0x848A", + "EventName": "FP_HP_SCALE_MIN_OPS_SPEC", + "BriefDescription": "Scalable element arithmetic operations speculatively executed, smallest type is half-precision floating-point." + }, { "EventCode": "0x848B", "EventName": "FP_BF16_SCALE_MIN_OPS_SPEC", diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index bb3fa8a33496a..7f0eaa7020485 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -46,3 +46,4 @@ 0x00000000500f0000,v1,ampere/emag,core 0x00000000c00fac30,v1,ampere/ampereone,core 0x00000000c00fac40,v1,ampere/ampereonex,core +0x000000004e0f0100,v1,nvidia/t410,core diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json new file mode 100644 index 0000000000000..ef4effc00ec3a --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json @@ -0,0 +1,45 @@ +[ + { + "ArchStdEvent": "BR_MIS_PRED", + "PublicDescription": "This event counts branches which are speculatively executed and mispredicted." + }, + { + "ArchStdEvent": "BR_PRED", + "PublicDescription": "This event counts all speculatively executed branches." + }, + { + "EventCode": "0x017e", + "EventName": "BR_PRED_BTB_CTX_UPDATE", + "PublicDescription": "Branch context table update." + }, + { + "EventCode": "0x0188", + "EventName": "BR_MIS_PRED_DIR_RESOLVED", + "PublicDescription": "Number of branch misprediction due to direction misprediction." + }, + { + "EventCode": "0x0189", + "EventName": "BR_MIS_PRED_DIR_UNCOND_RESOLVED", + "PublicDescription": "Number of branch misprediction due to direction misprediction for unconditional branches." 
+ }, + { + "EventCode": "0x018a", + "EventName": "BR_MIS_PRED_DIR_UNCOND_DIRECT_RESOLVED", + "PublicDescription": "Number of branch mispredictions due to direction misprediction for unconditional direct branches." + }, + { + "EventCode": "0x018b", + "EventName": "BR_PRED_MULTI_RESOLVED", + "PublicDescription": "Number of resolved branches whose prediction was made by the polymorphic indirect predictor." + }, + { + "EventCode": "0x018c", + "EventName": "BR_MIS_PRED_MULTI_RESOLVED", + "PublicDescription": "Number of branch mispredictions whose prediction was made by the polymorphic indirect predictor." + }, + { + "EventCode": "0x01e4", + "EventName": "BR_RGN_RECLAIM", + "PublicDescription": "This event counts the Indirect predictor entries flushed by region reclamation." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json new file mode 100644 index 0000000000000..9c315b2d70469 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json @@ -0,0 +1,6 @@ +[ + { + "ArchStdEvent": "BRB_FILTRATE", + "PublicDescription": "This event counts each valid branch record captured in the branch record buffer. Branch records that are not captured because they are removed by filtering are not counted." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json new file mode 100644 index 0000000000000..5bb8de617c68b --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json @@ -0,0 +1,48 @@ +[ + { + "ArchStdEvent": "BUS_ACCESS", + "PublicDescription": "This event counts the number of data-beat accesses between the CPU and the external bus. This count includes accesses due to read, write, and snoop. Each beat of data is counted individually." + }, + { + "ArchStdEvent": "BUS_CYCLES", + "PublicDescription": "This event counts bus cycles in the CPU. 
Bus cycles represent a clock cycle in which a transaction could be sent or received on the interface from the CPU to the external bus. Since that interface is driven at the same clock speed as the CPU, this event increments at the rate of CPU clock. Regardless of the WFE/WFI state of the PE, this event increments on each processor clock." + }, + { + "ArchStdEvent": "BUS_ACCESS_RD", + "PublicDescription": "This event counts memory Read transactions seen on the external bus. Each beat of data is counted individually." + }, + { + "ArchStdEvent": "BUS_ACCESS_WR", + "PublicDescription": "This event counts memory Write transactions seen on the external bus. Each beat of data is counted individually." + }, + { + "EventCode": "0x0154", + "EventName": "BUS_REQUEST_REQ", + "PublicDescription": "Bus request, request." + }, + { + "EventCode": "0x0155", + "EventName": "BUS_REQUEST_RETRY", + "PublicDescription": "Bus request, retry." + }, + { + "EventCode": "0x0198", + "EventName": "L2_CHI_CBUSY0", + "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 0." + }, + { + "EventCode": "0x0199", + "EventName": "L2_CHI_CBUSY1", + "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 1." + }, + { + "EventCode": "0x019a", + "EventName": "L2_CHI_CBUSY2", + "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 2." + }, + { + "EventCode": "0x019b", + "EventName": "L2_CHI_CBUSY3", + "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 3."
+ } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json new file mode 100644 index 0000000000000..ecd996c3610be --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json @@ -0,0 +1,62 @@ +[ + { + "ArchStdEvent": "EXC_TAKEN", + "PublicDescription": "This event counts any taken architecturally visible exceptions such as IRQ, FIQ, SError, and other synchronous exceptions. Exceptions are counted whether or not they are taken locally." + }, + { + "ArchStdEvent": "EXC_RETURN", + "PublicDescription": "This event counts any architecturally executed exception return instructions. For example: AArch64: ERET." + }, + { + "ArchStdEvent": "EXC_UNDEF", + "PublicDescription": "This event counts the number of synchronous exceptions which are taken locally that are due to attempting to execute an instruction that is UNDEFINED.\nAttempting to execute instruction bit patterns that have not been allocated.\nAttempting to execute instructions when they are disabled.\nAttempting to execute instructions at an inappropriate Exception level.\nAttempting to execute an instruction when the value of PSTATE.IL is 1." + }, + { + "ArchStdEvent": "EXC_SVC", + "PublicDescription": "This event counts SVC exceptions taken locally." + }, + { + "ArchStdEvent": "EXC_PABORT", + "PublicDescription": "This event counts synchronous exceptions that are taken locally and caused by Instruction Aborts." + }, + { + "ArchStdEvent": "EXC_DABORT", + "PublicDescription": "This event counts exceptions that are taken locally and are caused by data aborts or SErrors. Conditions that could cause those exceptions are attempting to read or write memory where the MMU generates a fault, attempting to read or write memory with a misaligned address, Interrupts from the nSEI inputs and internally generated SErrors." 
+ }, + { + "ArchStdEvent": "EXC_IRQ", + "PublicDescription": "This event counts IRQ exceptions including the virtual IRQs that are taken locally." + }, + { + "ArchStdEvent": "EXC_FIQ", + "PublicDescription": "This event counts FIQ exceptions including the virtual FIQs that are taken locally." + }, + { + "ArchStdEvent": "EXC_SMC", + "PublicDescription": "This event counts SMC exceptions taken to EL3." + }, + { + "ArchStdEvent": "EXC_HVC", + "PublicDescription": "This event counts HVC exceptions taken to EL2." + }, + { + "ArchStdEvent": "EXC_TRAP_PABORT", + "PublicDescription": "This event counts exceptions which are traps not taken locally and are caused by Instruction Aborts. For example, attempting to execute an instruction with a misaligned PC." + }, + { + "ArchStdEvent": "EXC_TRAP_DABORT", + "PublicDescription": "This event counts exceptions which are traps not taken locally and are caused by Data Aborts or SError Interrupts. Conditions that could cause those exceptions are:\n* Attempting to read or write memory where the MMU generates a fault,\n* Attempting to read or write memory with a misaligned address,\n* Interrupts from the SEI input,\n* Internally generated SErrors." + }, + { + "ArchStdEvent": "EXC_TRAP_OTHER", + "PublicDescription": "This event counts the number of synchronous trap exceptions which are not taken locally and are not SVC, SMC, HVC, Data Aborts, Instruction Aborts, or Interrupts." + }, + { + "ArchStdEvent": "EXC_TRAP_IRQ", + "PublicDescription": "This event counts IRQ exceptions including the virtual IRQs that are not taken locally." + }, + { + "ArchStdEvent": "EXC_TRAP_FIQ", + "PublicDescription": "This event counts FIQs which are not taken locally but taken from EL0, EL1, or EL2 to EL3 (which would be the normal behavior for FIQs when not executing in EL3)." 
+ } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json new file mode 100644 index 0000000000000..3588e130781db --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json @@ -0,0 +1,78 @@ +[ + { + "ArchStdEvent": "FP_HP_SPEC", + "PublicDescription": "This event counts speculatively executed half precision floating point operations." + }, + { + "ArchStdEvent": "FP_SP_SPEC", + "PublicDescription": "This event counts speculatively executed single precision floating point operations." + }, + { + "ArchStdEvent": "FP_DP_SPEC", + "PublicDescription": "This event counts speculatively executed double precision floating point operations." + }, + { + "ArchStdEvent": "FP_SCALE_OPS_SPEC", + "PublicDescription": "This event counts speculatively executed scalable single precision floating point operations." + }, + { + "ArchStdEvent": "FP_FIXED_OPS_SPEC", + "PublicDescription": "This event counts speculatively executed non-scalable single precision floating point operations." + }, + { + "ArchStdEvent": "FP_HP_SCALE_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the largest type was half-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the counter to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." 
+ }, + { + "ArchStdEvent": "FP_HP_FIXED_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the largest type was half-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_SP_SCALE_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the largest type was single-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_SP_FIXED_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the largest type was single-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+ }, + { + "ArchStdEvent": "FP_DP_SCALE_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the largest type was double-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_DP_FIXED_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the largest type was double-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_SP_FIXED_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was single-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_HP_FIXED_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was half-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." 
+ }, + { + "ArchStdEvent": "FP_BF16_FIXED_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was BFloat16 floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment. This event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_FP8_FIXED_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was 8-bit floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_SP_SCALE_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was single-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+ }, + { + "ArchStdEvent": "FP_HP_SCALE_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was half-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_BF16_SCALE_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was BFloat16 floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." + }, + { + "ArchStdEvent": "FP_FP8_SCALE_MIN_OPS_SPEC", + "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was 8-bit floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json new file mode 100644 index 0000000000000..bd9c248387aae --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json @@ -0,0 +1,15 @@ +[ + { + "ArchStdEvent": "CPU_CYCLES", + "PublicDescription": "This event counts CPU clock cycles when the PE is not in WFE/WFI. 
The clock measured by this event is defined as the physical clock driving the CPU logic." + }, + { + "ArchStdEvent": "CNT_CYCLES", + "PublicDescription": "This event increments at a constant frequency equal to the rate of increment of the System Counter, CNTPCT_EL0.\nThis event does not increment when the PE is in WFE/WFI." + }, + { + "EventCode": "0x01e1", + "EventName": "CPU_SLOT", + "PublicDescription": "Entitled CPU slots.\nThis event counts the number of slots. When in ST mode, this event shall increment by PMMIR_EL1.SLOTS quantities, and when in SMT partitioned resource mode (regardless of in WFI state or otherwise), this event is incremented by PMMIR_EL1.SLOTS/2 quantities." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json new file mode 100644 index 0000000000000..ed6f764eff242 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json @@ -0,0 +1,122 @@ +[ + { + "ArchStdEvent": "L1D_CACHE_REFILL", + "PublicDescription": "This event counts L1 D-cache refills caused by speculatively executed load or store operations, preload instructions, or hardware cache prefetching that missed in the L1 D-cache. This event only counts one event per cache line.\nSince the caches are Write-back only for this processor, there are no Write-through cache accesses." + }, + { + "ArchStdEvent": "L1D_CACHE", + "PublicDescription": "This event counts L1 D-cache accesses from any load/store operations, software preload, or hardware prefetch operations. Atomic operations that resolve in the CPU's caches (near atomic operations) count as both a write access and read access. Each access to a cache line is counted including the multiple accesses caused by single instructions such as LDM or STM. 
Each access to other L1 data or unified memory structures, for example refill buffers, write buffers, and write-back buffers, are also counted.\nThis event counts the sum of the following events:\nL1D_CACHE_RD,\nL1D_CACHE_WR,\nL1D_CACHE_PRFM, and\nL1D_CACHE_HWPRF." + }, + { + "ArchStdEvent": "L1D_CACHE_WB", + "PublicDescription": "This event counts write-backs of dirty data from the L1 D-cache to the L2 cache. This occurs when either a dirty cache line is evicted from L1 D-cache and allocated in the L2 cache or dirty data is written to the L2 and possibly to the next level of cache. This event counts both victim cache line evictions and cache write-backs from snoops or cache maintenance operations. The following cache operations are not counted:\n* Invalidations which do not result in data being transferred out of the L1 (such as evictions of clean data),\n* Full line writes which write to L2 without writing L1, such as write streaming mode.\nThis event is the sum of the following events:\nL1D_CACHE_WB_CLEAN and\nL1D_CACHE_WB_VICTIM." + }, + { + "ArchStdEvent": "L1D_CACHE_LMISS_RD", + "PublicDescription": "This event counts cache line refills into the L1 D-cache from any memory Read operations, that incurred additional latency.\nCounts same as L1D_CACHE_REFILL_RD on this CPU." + }, + { + "ArchStdEvent": "L1D_CACHE_RD", + "PublicDescription": "This event counts L1 D-cache accesses from any Load operation. Atomic Load operations that resolve in the CPU's caches count as both a write access and read access." + }, + { + "ArchStdEvent": "L1D_CACHE_WR", + "PublicDescription": "This event counts L1 D-cache accesses generated by Store operations. This event also counts accesses caused by a DC ZVA (D-cache zero, specified by virtual address) instruction. Near atomic operations that resolve in the CPU's caches count as a write access and read access.\nThis event is a subset of the L1D_CACHE event, except this event only counts memory Write operations." 
+ }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_RD", + "PublicDescription": "This event counts L1 D-cache refills caused by speculatively executed Load instructions where the memory Read operation misses in the L1 D-cache. This event only counts one event per cache line.\nThis event is a subset of the L1D_CACHE_REFILL event, but only counts memory Read operations. This event does not count reads caused by cache maintenance operations or preload instructions." + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_WR", + "PublicDescription": "This event counts L1 D-cache refills caused by speculatively executed Store instructions where the memory Write operation misses in the L1 D-cache. This event only counts one event per cache line.\nThis event is a subset of the L1D_CACHE_REFILL event, but only counts memory Write operations." + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_INNER", + "PublicDescription": "This event counts L1 D-cache refills (L1D_CACHE_REFILL) where the cache line data came from caches inside the immediate Cluster of the Core (L2 cache)." + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_OUTER", + "PublicDescription": "This event counts L1 D-cache refills (L1D_CACHE_REFILL) for which the cache line data came from outside the immediate Cluster of the Core, like an SLC in the system interconnect or DRAM or remote socket." + }, + { + "ArchStdEvent": "L1D_CACHE_WB_VICTIM", + "PublicDescription": "This event counts dirty cache line evictions from the L1 D-cache caused by a new cache line allocation. This event does not count evictions caused by cache maintenance operations.\nThis event is a subset of the L1D_CACHE_WB event, but only counts write-backs that are a result of the line being allocated for an access made by the CPU." + }, + { + "ArchStdEvent": "L1D_CACHE_WB_CLEAN", + "PublicDescription": "This event counts write-backs from the L1 D-cache that are a result of a coherency operation made by another CPU. 
Event counts include cache maintenance operations.\nThis event is a subset of the L1D_CACHE_WB event." + }, + { + "ArchStdEvent": "L1D_CACHE_INVAL", + "PublicDescription": "This event counts each explicit invalidation of a cache line in the L1 D-cache caused by:\n* Cache Maintenance Operations (CMO) that operate by a virtual address.\n* Broadcast cache coherency operations from another CPU in the system.\nThis event does not count for the following conditions:\n* A cache refill invalidates a cache line.\n* A CMO which is executed on that CPU and invalidates a cache line specified by Set/Way.\nNote that CMOs that operate by Set/Way cannot be broadcast from one CPU to another." + }, + { + "ArchStdEvent": "L1D_CACHE_RW", + "PublicDescription": "This event counts L1 data demand cache accesses from any Load or Store operation. Near atomic operations that resolve in the CPU's caches count as both a write access and read access.\nThis event is implemented as L1D_CACHE_RD + L1D_CACHE_WR" + }, + { + "ArchStdEvent": "L1D_CACHE_PRFM", + "PublicDescription": "This event counts L1 D-cache accesses from software preload or prefetch instructions." + }, + { + "ArchStdEvent": "L1D_CACHE_MISS", + "PublicDescription": "This event counts each demand access counted by L1D_CACHE_RW that misses in the L1 Data or unified cache, causing an access to outside of the L1 caches of this PE." + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_PRFM", + "PublicDescription": "This event counts L1 D-cache refills where the cache line access was generated by software preload or prefetch instructions." + }, + { + "ArchStdEvent": "L1D_CACHE_HWPRF", + "PublicDescription": "This event counts L1 D-cache accesses from any Load/Store operations generated by the hardware prefetcher." + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_HWPRF", + "PublicDescription": "This event counts each hardware prefetch access counted by L1D_CACHE_HWPRF that causes a refill of the L1 D-cache from outside of the L1 D-cache." 
+ }, + { + "ArchStdEvent": "L1D_CACHE_HIT_RW_FPRFM", + "PublicDescription": "This event counts each demand access first hit counted by L1D_CACHE_HIT_RW_FPRF where the cache line was fetched in response to a prefetch instruction. That is, the L1D_CACHE_REFILL_PRFM event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "ArchStdEvent": "L1D_CACHE_HIT_RW_FHWPRF", + "PublicDescription": "This event counts each demand access first hit counted by L1D_CACHE_HIT_RW_FPRF where the cache line was fetched by a hardware prefetcher. That is, the L1D_CACHE_REFILL_HWPRF Event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "ArchStdEvent": "L1D_CACHE_HIT_RW_FPRF", + "PublicDescription": "This event counts each demand access first hit counted by L1D_CACHE_HIT_RW where the cache line was fetched in response to a prefetch instruction or by a hardware prefetcher. That is, the L1D_CACHE_REFILL_PRF event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "ArchStdEvent": "L1D_LFB_HIT_RW_FPRFM", + "PublicDescription": "This event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW_FPRF where the cache line was fetched in response to a prefetch instruction. 
That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_PRFM event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "ArchStdEvent": "L1D_LFB_HIT_RW_FHWPRF", + "PublicDescription": "This event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW_FPRF, where the cache line was fetched by a hardware prefetcher. That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_HWPRF Event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "ArchStdEvent": "L1D_LFB_HIT_RW_FPRF", + "PublicDescription": "This event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW where the cache line was fetched in response to a prefetch instruction or by a hardware prefetcher. That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_PRF event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." 
+ }, + { + "EventCode": "0x01f5", + "EventName": "L1D_CACHE_REFILL_RW", + "PublicDescription": "L1 D-cache refill, demand Read and Write. This event counts demand Read and Write accesses that causes a refill of the L1 D-cache of this PE, from outside of this cache." + }, + { + "EventCode": "0x0204", + "EventName": "L1D_CACHE_REFILL_OUTER_LLC", + "PublicDescription": "This event counts L1D_CACHE_REFILL from L3 D-cache." + }, + { + "EventCode": "0x0205", + "EventName": "L1D_CACHE_REFILL_OUTER_DRAM", + "PublicDescription": "This event counts L1D_CACHE_REFILL from local memory." + }, + { + "EventCode": "0x0206", + "EventName": "L1D_CACHE_REFILL_OUTER_REMOTE", + "PublicDescription": "This event counts L1D_CACHE_REFILL from a remote memory." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json new file mode 100644 index 0000000000000..952454004d986 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json @@ -0,0 +1,114 @@ +[ + { + "ArchStdEvent": "L1I_CACHE_REFILL", + "PublicDescription": "This event counts cache line refills in the L1 I-cache caused by a missed instruction fetch (demand, hardware prefetch, and software preload accesses). Instruction fetches may include accessing multiple instructions, but the single cache line allocation is counted once." + }, + { + "ArchStdEvent": "L1I_CACHE", + "PublicDescription": "This event counts instruction fetches (demand, hardware prefetch, and software preload accesses) which access the L1 Instruction Cache. Instruction Cache accesses caused by cache maintenance operations are not counted." + }, + { + "ArchStdEvent": "L1I_CACHE_LMISS", + "PublicDescription": "This event counts cache line refills into the L1 I-cache, that incurred additional latency.\nCounts the same as L1I_CACHE_REFILL in this CPU." 
+ }, + { + "ArchStdEvent": "L1I_CACHE_RD", + "PublicDescription": "This event counts demand instruction fetches which access the L1 I-cache." + }, + { + "ArchStdEvent": "L1I_CACHE_PRFM", + "PublicDescription": "This event counts instruction fetches generated by software preload or prefetch instructions which access the L1 I-cache." + }, + { + "ArchStdEvent": "L1I_CACHE_HWPRF", + "PublicDescription": "This event counts instruction fetches which access the L1 I-cache generated by the hardware prefetcher." + }, + { + "ArchStdEvent": "L1I_CACHE_REFILL_PRFM", + "PublicDescription": "This event counts cache line refills in the L1 I-cache caused by a missed instruction fetch generated by software preload or prefetch instructions. Instruction fetches may include accessing multiple instructions, but the single cache line allocation is counted once." + }, + { + "ArchStdEvent": "L1I_CACHE_REFILL_HWPRF", + "PublicDescription": "This event counts each hardware prefetch access counted by L1I_CACHE_HWPRF that causes a refill of the Level 1I-cache from outside of the L1 I-cache." + }, + { + "ArchStdEvent": "L1I_CACHE_HIT_RD", + "PublicDescription": "This event counts demand instruction fetches that access the L1 I-cache and hit in the L1 I-cache." + }, + { + "ArchStdEvent": "L1I_CACHE_HIT_RD_FPRF", + "PublicDescription": "This event counts each demand fetch first hit counted by L1I_CACHE_HIT_RD where the cache line was fetched in response to a software preload or by a hardware prefetcher. That is, the L1I_CACHE_REFILL_PRF event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." 
+ }, + { + "ArchStdEvent": "L1I_CACHE_HIT", + "PublicDescription": "This event counts instruction fetches that access the L1 I-cache (demand, hardware prefetch, and software preload accesses) and hit in the L1 I-cache. I-cache accesses caused by cache maintenance operations are not counted." + }, + { + "ArchStdEvent": "L1I_CACHE_HIT_PRFM", + "PublicDescription": "This event counts instruction fetches generated by software preload or prefetch instructions that access the L1 I-cache and hit in the L1 I-cache." + }, + { + "ArchStdEvent": "L1I_LFB_HIT_RD", + "PublicDescription": "This event counts demand instruction fetches that access the L1 I-cache and hit in a line that is in the process of being loaded into the L1 I-cache." + }, + { + "EventCode": "0x0174", + "EventName": "L1I_HWPRF_REQ_DROP", + "PublicDescription": "L1 I-cache hardware prefetch dropped." + }, + { + "EventCode": "0x01e3", + "EventName": "L1I_CACHE_REFILL_RD", + "PublicDescription": "L1 I-cache refill, Read.\nThis event counts demand instruction fetch that causes a refill of the L1 I-cache of this PE, from outside of this cache." + }, + { + "EventCode": "0x01ea", + "EventName": "L1I_CFC_ENTRIES", + "PublicDescription": "This event counts the CFC (Cache Fill Control) entries.\nThe CFC is the fill buffer for I-cache." 
+ }, + { + "EventCode": "0x01ef", + "EventName": "L1I_CACHE_INVAL", + "PublicDescription": "L1 I-cache invalidate.\nThis event counts each explicit invalidation of a cache line in the L1 I-cache caused by:\n* Broadcast cache coherency operations from another CPU in the system.\n* Invalidation due to capacity eviction in L2 D-cache.\nThis event does not count for the following conditions:\n* A cache refill invalidates a cache line.\n* A CMO which is executed on that CPU Core and invalidates a cache line specified by Set/Way.\n* Cache Maintenance Operations (CMO) that operate by a virtual address.\nNote that\n* CMOs that operate by Set/Way cannot be broadcast from one CPU Core to another.\n* The CMO is treated as No-op for the purposes of L1 I-cache line invalidation, as this Core implements fully coherent I-cache." + }, + { + "EventCode": "0x0212", + "EventName": "L1I_CACHE_HIT_HWPRF", + "PublicDescription": "This event counts each hardware prefetch access that hits an L1 I-cache." + }, + { + "EventCode": "0x0215", + "EventName": "L1I_LFB_HIT", + "PublicDescription": "L1 Line fill buffer hit.\nThis event counts each Demand or software preload or hardware prefetch induced instruction fetch that hits an L1 I-cache line that is in the process of being loaded into the L1 instruction cache, and so does not generate a new refill, but has to wait for the previous refill to complete." + }, + { + "EventCode": "0x0216", + "EventName": "L1I_LFB_HIT_PRFM", + "PublicDescription": "This event counts each software prefetch access that hits a cache line that is in the process of being loaded into the L1 instruction cache, and so does not generate a new refill, but has to wait for the previous refill to complete." 
+ }, + { + "EventCode": "0x0219", + "EventName": "L1I_LFB_HIT_HWPRF", + "PublicDescription": "This event counts each hardware prefetch access that hits a cache line that is in the process of being loaded into the L1 instruction cache, and so does not generate a new refill, but has to wait for the previous refill to complete." + }, + { + "EventCode": "0x0221", + "EventName": "L1I_PRFM_REQ", + "PublicDescription": "L1 I-cache software prefetch requests." + }, + { + "EventCode": "0x0222", + "EventName": "L1I_HWPRF_REQ", + "PublicDescription": "L1 I-cache hardware prefetch requests." + }, + { + "EventCode": "0x0228", + "EventName": "L1I_CACHE_HIT_PRFM_FPRF", + "PublicDescription": "L1 I-cache software prefetch access first hit, fetched by hardware or software prefetch.\nThis event counts each software preload access first hit where the cache line was fetched in response to a hardware prefetcher or software preload instruction.\nOnly the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "EventCode": "0x022a", + "EventName": "L1I_CACHE_HIT_HWPRF_FPRF", + "PublicDescription": "L1 I-cache hardware prefetch access first hit, fetched by hardware or software prefetch.\nThis event counts each hardware prefetch access first hit where the cache line was fetched in response to a hardware prefetcher or software preload instruction.\nOnly the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." 
+ } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json new file mode 100644 index 0000000000000..66f21a94381ed --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json @@ -0,0 +1,134 @@ +[ + { + "ArchStdEvent": "L2D_CACHE", + "PublicDescription": "This event counts accesses to the L2 cache due to data accesses. L2 cache is a unified cache for data and instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This event also counts write-back of dirty data from L1 D-cache to the L2 cache.\nI-cache accesses are included in this event. This event is the sum of the following events:\nL2D_CACHE_RD,\nL2D_CACHE_WR,\nL2D_CACHE_PRFM, and\nL2D_CACHE_HWPRF." + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL", + "PublicDescription": "This event counts cache line refills into the L2 cache. L2 cache is a unified cache for data and instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nI-cache refills are included in this event. This event is the sum of the following events:\nL2D_CACHE_REFILL_RD,\nL2D_CACHE_REFILL_WR,\nL2D_CACHE_REFILL_HWPRF, and\nL2D_CACHE_REFILL_PRFM." + }, + { + "ArchStdEvent": "L2D_CACHE_WB", + "PublicDescription": "This event counts write-backs of data from the L2 cache to outside the CPU. This includes snoops to the L2 (from other CPUs) which return data even if the snoops cause an invalidation. L2 cache line invalidations which do not write data outside the CPU and snoops which return data from an L1 cache are not counted. Data would not be written outside the cache when invalidating a clean cache line.\nThis event is the sum of the following events:\nL2D_CACHE_WB_VICTIM and\nL2D_CACHE_WB_CLEAN." + }, + { + "ArchStdEvent": "L2D_CACHE_RD", + "PublicDescription": "This event counts L2 D-cache accesses due to memory Read operations. 
L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nI-cache accesses are included in this event. This event is a subset of the L2D_CACHE event, but this event only counts memory Read operations." + }, + { + "ArchStdEvent": "L2D_CACHE_WR", + "PublicDescription": "This event counts L2 cache accesses due to memory Write operations. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis event is a subset of the L2D_CACHE event, but this event only counts memory Write operations." + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_RD", + "PublicDescription": "This event counts refills for memory accesses due to memory Read operation counted by L2D_CACHE_RD. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis CPU includes I-cache refills in this counter as an L2I equivalent event was not implemented. This event is a subset of the L2D_CACHE_REFILL event. This event does not count L2 refills caused by stashes into L2.\nThis count includes demand requests that encounter an L2 prefetch request or an L2 software prefetch request to the same cache line, which is still pending in the L2 LFB." + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_WR", + "PublicDescription": "This event counts refills for memory accesses due to memory Write operation counted by L2D_CACHE_WR. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis count includes demand requests that encounter an L2 prefetch request or an L2 software prefetch request to the same cache line, which is still pending in the L2 LFB." 
+ }, + { + "ArchStdEvent": "L2D_CACHE_WB_VICTIM", + "PublicDescription": "This event counts evictions from the L2 cache because of a line being allocated into the L2 cache.\nThis event is a subset of the L2D_CACHE_WB event." + }, + { + "ArchStdEvent": "L2D_CACHE_WB_CLEAN", + "PublicDescription": "This event counts write-backs from the L2 cache that are a result of any of the following:\n* Cache maintenance operations,\n* Snoop responses, or\n* Direct cache transfers to another CPU due to a forwarding snoop request.\nThis event is a subset of the L2D_CACHE_WB event." + }, + { + "ArchStdEvent": "L2D_CACHE_INVAL", + "PublicDescription": "This event counts each explicit invalidation of a cache line in the L2 cache by cache maintenance operations that operate by a virtual address, or by external coherency operations. This event does not count if either:\n* A cache refill invalidates a cache line, or\n* A cache Maintenance Operation (CMO), which invalidates a cache line specified by Set/Way,\nis executed on that CPU.\nCMOs that operate by Set/Way cannot be broadcast from one CPU to another." + }, + { + "ArchStdEvent": "L2D_CACHE_LMISS_RD", + "PublicDescription": "This event counts cache line refills into the L2 unified cache from any memory Read operations that incurred additional latency.\nCounts the same as L2D_CACHE_REFILL_RD in this CPU" + }, + { + "ArchStdEvent": "L2D_CACHE_RW", + "PublicDescription": "This event counts L2 cache demand accesses from any Load/Store operations. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nI-cache accesses are included in this event.\nThis event is the sum of the following events:\nL2D_CACHE_RD and\nL2D_CACHE_WR." 
+ }, + { + "ArchStdEvent": "L2D_CACHE_PRFM", + "PublicDescription": "This event counts L2 D-cache accesses generated by software preload or prefetch instructions with target = L1/L2/L3 cache.\nNote that a software preload or prefetch instructions with (target = L1/L2/L3) that hits in L1D will not result in an L2 D-cache access. Therefore, such a software preload or prefetch instructions will not be counted by this event." + }, + { + "ArchStdEvent": "L2D_CACHE_MISS", + "PublicDescription": "This event counts cache line misses in the L2 cache. L2 cache is a unified cache for data and instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis event counts the same as L2D_CACHE_REFILL_RD in this CPU." + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_PRFM", + "PublicDescription": "This event counts refills due to accesses generated as a result of software preload or prefetch instructions as counted by L2D_CACHE_PRFM. I-cache refills are included in this event." + }, + { + "ArchStdEvent": "L2D_CACHE_HWPRF", + "PublicDescription": "This event counts the L2 D-cache access caused by L1 or L2 hardware prefetcher." + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_HWPRF", + "PublicDescription": "This event counts each hardware prefetch access counted by L2D_CACHE_HWPRF that causes a refill of the L2 cache, or any L1 Data, or Instruction cache of this PE, from outside of those caches.\nThis does not include prefetch requests pending waiting for a refill in LFB and a new demand request to the same cache line hitting the LFB entry. All such refills are counted as L2D_LFB_HIT_RWL1PRF_FHWPRF." + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_PRF", + "PublicDescription": "This event counts each access to L2 Cache due to a prefetch instruction, or hardware prefetch that causes a refill of the L2 or any Level 1, from outside of those caches." 
+ }, + { + "EventCode": "0x0108", + "EventName": "L2D_CACHE_IF_REFILL", + "PublicDescription": "L2 D-cache refill, instruction fetch.\nThis event counts demand instruction fetch that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x0109", + "EventName": "L2D_CACHE_TBW_REFILL", + "PublicDescription": "L2 D-cache refill, Page table walk.\nThis event counts demand translation table walk that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x010a", + "EventName": "L2D_CACHE_PF_REFILL", + "PublicDescription": "L2 D-cache refill, prefetch.\nThis event counts L1 or L2 hardware or software prefetch accesses that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x010b", + "EventName": "L2D_LFB_HIT_RWL1PRF_FHWPRF", + "PublicDescription": "L2 line fill buffer demand Read, demand Write or L1 prefetch first hit, fetched by hardware prefetch.\nThis event counts each of the following access that hit the line-fill buffer when the same cache line is already being fetched due to an L2 hardware prefetcher.\n* Demand Read or Write\n* L1I-HWPRF\n* L1D-HWPRF\n* L1I PRFM\n* L1D PRFM\nThese accesses hit a cache line that is currently being loaded into the L2 cache as a result of a hardware prefetcher to the same line. Consequently, this access does not initiate a new refill but waits for the completion of the previous refill.\nOnly the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "EventCode": "0x0179", + "EventName": "L2D_CACHE_HIT_RWL1PRF_FHWPRF", + "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by hardware prefetch. 
This event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 hardware prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "EventCode": "0x01b8", + "EventName": "L2D_CACHE_L1PRF", + "PublicDescription": "L2 D-cache access, L1 hardware or software prefetch. This event counts L1 Hardware or software prefetch access to L2 D-cache." + }, + { + "EventCode": "0x01b9", + "EventName": "L2D_CACHE_REFILL_L1PRF", + "PublicDescription": "L2 D-cache refill, L1 hardware or software prefetch.\nThis event counts each access counted by L2D_CACHE_L1PRF that causes a refill of the L2 cache or any L1 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x0201", + "EventName": "L2D_CACHE_BACKSNOOP_L1D_VIRT_ALIASING", + "PublicDescription": "This event counts when the L2 D-cache sends an invalidating back-snoop to the L1 D for an access initiated by the L1 D, where the corresponding line is already present in the L1 D-cache.\nThe L2 D-cache line tags the PE that refilled the line. It also retains specific bits of the VA to identify virtually aliased addresses.\nThe L1 D request requiring a back-snoop can originate either from the same PE that refilled the L2 D line or from a different PE. In either case, this event only counts those back-snoops where the requested VA mismatches the VA stored in the L2 D tag.\nThis event is counted only by the PE that initiated the original request necessitating a back-snoop.\nNote: The L1 D is VIPT, it identifies this access as a miss. Conversely, as L2 is PIPT, it identifies this as a hit. L2 D utilizes the back-snoop mechanism to refill L1 D with the snooped data." 
+ }, + { + "EventCode": "0x0208", + "EventName": "L2D_CACHE_RWL1PRF", + "PublicDescription": "L2 D-cache access, demand Read, demand Write or L1 hardware or software prefetch.\nThis event counts each access to L2 D-cache due to the following:\n* Demand Read or Write.\n* L1 Hardware or software prefetch." + }, + { + "EventCode": "0x020a", + "EventName": "L2D_CACHE_REFILL_RWL1PRF", + "PublicDescription": "L2 D-cache refill, demand Read, demand Write or L1 hardware or software prefetch.\nThis event counts each access counted by L2D_CACHE_RWL1PRF that causes a refill of the L2 cache, or any L1 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x020c", + "EventName": "L2D_CACHE_HIT_RWL1PRF_FPRFM", + "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by software prefetch.\nThis event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 software prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." + }, + { + "EventCode": "0x020e", + "EventName": "L2D_CACHE_HIT_RWL1PRF_FPRF", + "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by software or hardware prefetch.\nThis event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 hardware prefetch or software prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." 
+ } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json new file mode 100644 index 0000000000000..851d0a70de9c0 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json @@ -0,0 +1,107 @@ +[ + { + "ArchStdEvent": "L3D_CACHE_ALLOCATE", + "PublicDescription": "This event counts each memory Write operation that writes an entire line into the L3 data without fetching data from outside the L3 Data. These are allocations of cache lines in the L3 Data that are not refills counted by\nL3D_CACHE_REFILL. For example:\nA Write-back of an entire cache line from an L2 cache to the L3 D-cache.\n* A Write of an entire cache line from a coalescing Write buffer.\n* An operation such as DC ZVA.\nThis counter does not count writes that write an entire line to beyond level 3. Thus this counter does not count the streaming writes to beyond L3 cache." + }, + { + "ArchStdEvent": "L3D_CACHE_REFILL", + "PublicDescription": "This event counts each access counted by L3D_CACHE that causes a refill of the L3 Data, or any L1 Data, instruction or L2 cache of this PE, from outside of those caches. This includes the refill due to hardware prefetch and software prefetch accesses.\nThis event is a sum of L3D_CACHE_MISS, L3D_CACHE_REFILL_PRFM and L3D_CACHE_REFILL_HWPRF event.\nA refill includes any access that causes data to be fetched from outside of the L1 to L3 caches, even if the data is ultimately not allocated into the L3 D-cache." 
+ }, + { + "ArchStdEvent": "L3D_CACHE", + "PublicDescription": "This event counts each memory Read operation or memory Write operation that causes a cache access to the Level 3.\nThis event is a sum of the following Events:\n* L3D_CACHE_RD(0x00a0)\n* L3D_CACHE_ALLOCATE(0x0029)\n* L3D_CACHE_PRFM(0x8151)\n* L3D_CACHE_HWPRF(0x8156)\n* L2D_CACHE_WB(0x0018)" + }, + { + "ArchStdEvent": "LL_CACHE_RD", + "PublicDescription": "This is an alias to the event L3D_CACHE_RD (0x00a0)." + }, + { + "ArchStdEvent": "LL_CACHE_MISS_RD", + "PublicDescription": "This is an alias to the event L3D_CACHE_REFILL_RD (0x00a2)." + }, + { + "ArchStdEvent": "L3D_CACHE_RD", + "PublicDescription": "This event counts each Memory Read operation to L3 D-cache from instruction fetch, Load/Store, and MMU translation table accesses. This does not include hardware prefetcher or PRFM instruction accesses. This includes L1 and L2 prefetcher accesses to L3 D-cache." + }, + { + "ArchStdEvent": "L3D_CACHE_REFILL_RD", + "PublicDescription": "This event counts each access counted by both L3D_CACHE_RD and L3D_CACHE_REFILL. That is, every refill of the L3 cache counted by L3D_CACHE_REFILL that is caused by a Memory Read operation.\nThe L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same event in the hardware." + }, + { + "ArchStdEvent": "L3D_CACHE_LMISS_RD", + "PublicDescription": "This event counts each memory Read operation to the L3 cache counted by L3D_CACHE that incurs additional latency because it returns data from outside of the L1 to L3 caches.\nThe L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same event in the hardware." 
+ }, + { + "ArchStdEvent": "L3D_CACHE_RW", + "PublicDescription": "This event counts each access counted by L3D_CACHE that is due to a demand memory Read operation or demand memory Write operation.\nThis event is a sum of L3D_CACHE_RD(0x00a0), L3D_CACHE_ALLOCATE(0x0029) and L2D_CACHE_WB(0x0018).\nNote that this counter does not count writes that write an entire line to beyond level 3. Thus this counter does not count the streaming Writes to beyond L3 cache." + }, + { + "ArchStdEvent": "L3D_CACHE_PRFM", + "PublicDescription": "This event counts each access counted by L3D_CACHE that is due to a prefetch instruction. This includes L3 Data accesses due to the L1, L2, or L3 prefetch instruction." + }, + { + "ArchStdEvent": "L3D_CACHE_MISS", + "PublicDescription": "This event counts each demand Read access counted by L3D_CACHE_RD that misses in the L1 to L3 Data, causing an access to outside of the L3 cache.\nThe L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same event in the hardware." + }, + { + "ArchStdEvent": "L3D_CACHE_REFILL_PRFM", + "PublicDescription": "This event counts each access counted by L3D_CACHE_PRFM that causes a refill of the L3 cache, or any L1 or L2 Data, from outside of those caches." + }, + { + "ArchStdEvent": "L3D_CACHE_HWPRF", + "PublicDescription": "This event counts each access to L3 cache that is due to a hardware prefetcher. This includes L3D accesses due to the Level-1 or Level-2 or Level-3 hardware prefetcher." + }, + { + "ArchStdEvent": "L3D_CACHE_REFILL_HWPRF", + "PublicDescription": "This event counts each hardware prefetch counted by L3D_CACHE_HWPRF that causes a refill of the L3 Data or unified cache, or any L1 or L2 Data, Instruction, or unified cache of this PE, from outside of those caches." 
+ }, + { + "ArchStdEvent": "L3D_CACHE_REFILL_PRF", + "PublicDescription": "This event counts each access to L3 cache due to a prefetch instruction, or hardware prefetch that causes a refill of the L3 Data, or any L1 or L2 Data, from outside of those caches." + }, + { + "EventCode": "0x01e8", + "EventName": "L3D_CACHE_RWL1PRFL2PRF", + "PublicDescription": "L3 cache access, demand Read, demand Write, L1 hardware or software prefetch or L2 hardware or software prefetch.\nThis event counts each access to L3 D-cache due to the following:\n* Demand Read or Write.\n* L1 Hardware or software prefetch.\n* L2 Hardware or software prefetch." + }, + { + "EventCode": "0x01e9", + "EventName": "L3D_CACHE_REFILL_RWL1PRFL2PRF", + "PublicDescription": "L3 cache refill, demand Read, demand Write, L1 hardware or software prefetch or L2 hardware or software prefetch.\nThis event counts each access counted by L3D_CACHE_RWL1PRFL2PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x01f6", + "EventName": "L3D_CACHE_REFILL_L2PRF", + "PublicDescription": "This event counts each access counted by L3D_CACHE_L2PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x01f7", + "EventName": "L3D_CACHE_HIT_RWL1PRFL2PRF_FPRF", + "PublicDescription": "L3 cache demand Read, demand Write, L1 prefetch L2 prefetch first hit, fetched by software or hardware prefetch.\nThis event counts each demand Read, demand Write, L1 hardware or software prefetch request and L2 hardware or software prefetch that hit an L3 D-cache line that was refilled into L3 D-cache in response to an L3 hardware prefetch or software prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache." 
+ }, + { + "EventCode": "0x0225", + "EventName": "L3D_CACHE_REFILL_IF", + "PublicDescription": "L3 cache refill, instruction fetch.\nThis event counts demand instruction fetch that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x0226", + "EventName": "L3D_CACHE_REFILL_MM", + "PublicDescription": "L3 cache refill, translation table walk access.\nThis event counts demand translation table access that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x0227", + "EventName": "L3D_CACHE_REFILL_L1PRF", + "PublicDescription": "This event counts each access counted by L3D_CACHE_L1PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches." + }, + { + "EventCode": "0x022c", + "EventName": "L3D_CACHE_L1PRF", + "PublicDescription": "This event counts the L3 D-cache access due to L1 hardware prefetch or software prefetch request.\nThe L1 hardware prefetch or software prefetch requests that miss the L1I, L1D and L2 D-cache are counted by this counter" + }, + { + "EventCode": "0x022d", + "EventName": "L3D_CACHE_L2PRF", + "PublicDescription": "This event counts the L3 D-cache access due to L2 hardware prefetch or software prefetch request.\nThe L2 hardware prefetch or software prefetch requests that miss the L2 D-cache are counted by this counter" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json new file mode 100644 index 0000000000000..becd2d90bf396 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json @@ -0,0 +1,46 @@ +[ + { + "ArchStdEvent": "MEM_ACCESS", + "PublicDescription": "This event counts memory accesses issued by the CPU load/store unit, where those accesses are issued due to load or store operations. 
This event counts memory accesses regardless of whether the data is received from any level of cache hierarchy or external memory. If memory accesses are broken up into smaller transactions than what were specified in the load or store instructions, then the event counts those smaller memory transactions.\nMemory accesses generated by the following instructions or activity are not counted: instruction fetches, cache maintenance instructions, translation table walks or prefetches, memory prefetch operations. This event counts the sum of the following events:\nMEM_ACCESS_RD and\nMEM_ACCESS_WR." + }, + { + "ArchStdEvent": "MEMORY_ERROR", + "PublicDescription": "This event counts any detected correctable or uncorrectable physical memory errors (ECC or parity) in protected CPU RAMs. On the Core, this event counts errors in the caches (including data and tag RAMs). Any detected memory error (from either a speculative and abandoned access, or an architecturally executed access) is counted.\nNote that errors are only detected when the actual protected memory is accessed by an operation." + }, + { + "ArchStdEvent": "REMOTE_ACCESS", + "PublicDescription": "This event counts each external bus read access that causes an access to a remote device. That is, a socket that does not contain the PE." + }, + { + "ArchStdEvent": "MEM_ACCESS_RD", + "PublicDescription": "This event counts memory accesses issued by the CPU due to Load operations. This event counts any memory Load access, no matter whether the data is received from any level of cache hierarchy or external memory. This event also counts atomic Load operations. 
If memory accesses are broken up by the Load/Store unit into smaller transactions that are issued by the bus interface, then the event counts those smaller transactions.\nThe following instructions are not counted:\n1) Instruction fetches,\n2) Cache maintenance instructions,\n3) Translation table walks or prefetches,\n4) Memory prefetch operations.\nThis event is a subset of the MEM_ACCESS event but the event only counts memory-Read operations." + }, + { + "ArchStdEvent": "MEM_ACCESS_WR", + "PublicDescription": "This event counts memory accesses issued by the CPU due to Store operations. This event counts any memory Store access, no matter whether the data is located in any level of cache or external memory. This event also counts atomic Load and Store operations. If memory accesses are broken up by the Load/Store unit into smaller transactions that are issued by the bus interface, then the event counts those smaller transactions." + }, + { + "ArchStdEvent": "LDST_ALIGN_LAT", + "PublicDescription": "This event counts the number of memory Read and Write accesses in a cycle that incurred additional latency due to the alignment of the address and the size of data being accessed, which results in a store crossing a single cache line.\nThis event is implemented as the sum of the following events on this CPU:\nLD_ALIGN_LAT and\nST_ALIGN_LAT." + }, + { + "ArchStdEvent": "LD_ALIGN_LAT", + "PublicDescription": "This event counts the number of memory Read accesses in a cycle that incurred additional latency due to the alignment of the address and size of data being accessed, which results in a load crossing a single cache line." + }, + { + "ArchStdEvent": "ST_ALIGN_LAT", + "PublicDescription": "This event counts the number of memory Write accesses in a cycle that incurred additional latency due to the alignment of the address and size of data being accessed." 
+ }, + { + "ArchStdEvent": "INST_FETCH_PERCYC", + "PublicDescription": "This event counts the number of instruction fetches outstanding per cycle, which can be used to derive the average latency of instruction fetches." + }, + { + "ArchStdEvent": "MEM_ACCESS_RD_PERCYC", + "PublicDescription": "This event counts the number of outstanding Loads or memory Read accesses per cycle." + }, + { + "ArchStdEvent": "INST_FETCH", + "PublicDescription": "This event counts instruction memory accesses that the PE makes." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json new file mode 100644 index 0000000000000..b825ede03f544 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json @@ -0,0 +1,722 @@ +[ + { + "MetricName": "backend_bound", + "MetricExpr": "100 * (STALL_SLOT_BACKEND / CPU_SLOT)", + "BriefDescription": "This metric is the percentage of total slots that were stalled due to resource constraints in the backend of the processor.", + "ScaleUnit": "1percent of slots", + "MetricGroup": "TopdownL1" + }, + { + "MetricName": "backend_busy_bound", + "MetricExpr": "100 * (STALL_BACKEND_BUSY / STALL_BACKEND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to issue queues being too full to accept operations for execution.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_cache_l1d_bound", + "MetricExpr": "100 * (STALL_BACKEND_L1D / (STALL_BACKEND_L1D + STALL_BACKEND_MEM))", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by L1 D-cache misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_cache_l2d_bound", + "MetricExpr": "100 * (STALL_BACKEND_MEM / (STALL_BACKEND_L1D + STALL_BACKEND_MEM))", + "BriefDescription": "This metric is the 
percentage of total cycles stalled in the backend due to memory access latency issues caused by L2 D-cache misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_core_bound", + "MetricExpr": "100 * (STALL_BACKEND_CPUBOUND / STALL_BACKEND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to backend Core resource constraints not related to instruction fetch latency issues caused by memory access components.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_core_rename_bound", + "MetricExpr": "100 * (STALL_BACKEND_RENAME / STALL_BACKEND_CPUBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend as the rename unit registers are unavailable.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_mem_bound", + "MetricExpr": "100 * (STALL_BACKEND_MEMBOUND / STALL_BACKEND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to backend Core resource constraints related to memory access latency issues caused by memory access components.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_mem_cache_bound", + "MetricExpr": "100 * ((STALL_BACKEND_L1D + STALL_BACKEND_MEM) / STALL_BACKEND_MEMBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory latency issues caused by D-cache misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_mem_store_bound", + "MetricExpr": "100 * (STALL_BACKEND_ST / STALL_BACKEND_MEMBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory Write pending caused by Stores stalled in the pre-commit stage.", + "ScaleUnit": 
"1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_mem_tlb_bound", + "MetricExpr": "100 * (STALL_BACKEND_TLB / STALL_BACKEND_MEMBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by Data TLB misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Backend" + }, + { + "MetricName": "backend_stalled_cycles", + "MetricExpr": "100 * (STALL_BACKEND / CPU_CYCLES)", + "BriefDescription": "This metric is the percentage of cycles that were stalled due to resource constraints in the backend unit of the processor.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Cycle_Accounting" + }, + { + "MetricName": "bad_speculation", + "MetricExpr": "100 - (frontend_bound + retiring + backend_bound)", + "BriefDescription": "This metric is the percentage of total slots that executed operations and didn't retire due to a pipeline flush. This indicates cycles that were utilized but inefficiently.", + "ScaleUnit": "1percent of slots", + "MetricGroup": "TopdownL1" + }, + { + "MetricName": "barrier_percentage", + "MetricExpr": "100 * ((ISB_SPEC + DSB_SPEC + DMB_SPEC) / INST_SPEC)", + "BriefDescription": "This metric measures instruction and data barrier operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "branch_direct_ratio", + "MetricExpr": "BR_IMMED_RETIRED / BR_RETIRED", + "BriefDescription": "This metric measures the ratio of direct branches retired to the total number of branches architecturally executed.", + "ScaleUnit": "1per branch", + "MetricGroup": "Branch_Effectiveness" + }, + { + "MetricName": "branch_indirect_ratio", + "MetricExpr": "BR_IND_RETIRED / BR_RETIRED", + "BriefDescription": "This metric measures the ratio of indirect branches retired, including function returns, to the total number of branches 
architecturally executed.", + "ScaleUnit": "1per branch", + "MetricGroup": "Branch_Effectiveness" + }, + { + "MetricName": "branch_misprediction_ratio", + "MetricExpr": "BR_MIS_PRED_RETIRED / BR_RETIRED", + "BriefDescription": "This metric measures the ratio of branches mispredicted to the total number of branches architecturally executed. This gives an indication of the effectiveness of the branch prediction unit.", + "ScaleUnit": "1per branch", + "MetricGroup": "Miss_Ratio;Branch_Effectiveness" + }, + { + "MetricName": "branch_mpki", + "MetricExpr": "1000 * (BR_MIS_PRED_RETIRED / INST_RETIRED)", + "BriefDescription": "This metric measures the number of branch mispredictions per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;Branch_Effectiveness" + }, + { + "MetricName": "branch_percentage", + "MetricExpr": "100 * ((BR_IMMED_SPEC + BR_INDIRECT_SPEC) / INST_SPEC)", + "BriefDescription": "This metric measures branch operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "branch_return_ratio", + "MetricExpr": "BR_RETURN_RETIRED / BR_RETIRED", + "BriefDescription": "This metric measures the ratio of branches retired that are function returns to the total number of branches architecturally executed.", + "ScaleUnit": "1per branch", + "MetricGroup": "Branch_Effectiveness" + }, + { + "MetricName": "bus_bandwidth", + "MetricExpr": "BUS_ACCESS * 32 / duration_time ", + "BriefDescription": "This metric measures the bus bandwidth of the data transferred between this PE's L2 and the unCore in the system.", + "ScaleUnit": "1Bytes/sec" + }, + { + "MetricName": "cpu_cycles_fraction_in_st_mode", + "MetricExpr": "((CPU_SLOT/CPU_CYCLES) - 5) / 5", + "BriefDescription": "This metric measures the fraction of CPU cycles spent in ST mode during program execution.", + "ScaleUnit": "1fraction of cycles", + "MetricGroup": "SMT" + }, + { + "MetricName": 
"cpu_cycles_in_smt_mode", + "MetricExpr": "(1 - cpu_cycles_fraction_in_st_mode) * CPU_CYCLES", + "BriefDescription": "This metric counts CPU cycles in SMT mode during program execution.", + "ScaleUnit": "1CPU cycles", + "MetricGroup": "SMT" + }, + { + "MetricName": "cpu_cycles_in_st_mode", + "MetricExpr": "cpu_cycles_fraction_in_st_mode * CPU_CYCLES", + "BriefDescription": "This metric counts CPU cycles in ST mode during program execution.", + "ScaleUnit": "1CPU cycles", + "MetricGroup": "SMT" + }, + { + "MetricName": "crypto_percentage", + "MetricExpr": "100 * (CRYPTO_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures crypto operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "dtlb_mpki", + "MetricExpr": "1000 * (DTLB_WALK / INST_RETIRED)", + "BriefDescription": "This metric measures the number of Data TLB Walks per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;DTLB_Effectiveness" + }, + { + "MetricName": "dtlb_walk_average_latency", + "MetricExpr": "DTLB_WALK_PERCYC / DTLB_WALK", + "BriefDescription": "This metric measures the average latency of Data TLB walks in CPU cycles.", + "ScaleUnit": "1CPU cycles", + "MetricGroup": "Average_Latency" + }, + { + "MetricName": "dtlb_walk_ratio", + "MetricExpr": "DTLB_WALK / L1D_TLB", + "BriefDescription": "This metric measures the ratio of Data TLB Walks to the total number of Data TLB accesses. 
This gives an indication of the effectiveness of the Data TLB accesses.", + "ScaleUnit": "1per TLB access", + "MetricGroup": "Miss_Ratio;DTLB_Effectiveness" + }, + { + "MetricName": "fp16_percentage", + "MetricExpr": "100 * (FP_HP_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures half-precision floating point operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "FP_Precision_Mix" + }, + { + "MetricName": "fp32_percentage", + "MetricExpr": "100 * (FP_SP_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures single-precision floating point operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "FP_Precision_Mix" + }, + { + "MetricName": "fp64_percentage", + "MetricExpr": "100 * (FP_DP_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures double-precision floating point operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "FP_Precision_Mix" + }, + { + "MetricName": "fp_ops_per_cycle", + "MetricExpr": "(FP_SCALE_OPS_SPEC + FP_FIXED_OPS_SPEC) / CPU_CYCLES", + "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by any instruction. 
Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.", + "ScaleUnit": "1operations per cycle", + "MetricGroup": "FP_Arithmetic_Intensity" + }, + { + "MetricName": "frontend_bound", + "MetricExpr": "100 * (STALL_SLOT_FRONTEND_WITHOUT_MISPRED / CPU_SLOT)", + "BriefDescription": "This metric is the percentage of total slots that were stalled due to resource constraints in the frontend of the processor.", + "ScaleUnit": "1percent of slots", + "MetricGroup": "TopdownL1" + }, + { + "MetricName": "frontend_cache_l1i_bound", + "MetricExpr": "100 * (STALL_FRONTEND_L1I / (STALL_FRONTEND_L1I + STALL_FRONTEND_MEM))", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to memory access latency issues caused by L1 I-cache misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_cache_l2i_bound", + "MetricExpr": "100 * (STALL_FRONTEND_MEM / (STALL_FRONTEND_L1I + STALL_FRONTEND_MEM))", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to memory access latency issues caused by L2 I-cache misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_core_bound", + "MetricExpr": "100 * (STALL_FRONTEND_CPUBOUND / STALL_FRONTEND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to frontend Core resource constraints not related to instruction fetch latency issues caused by memory access components.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_core_flow_bound", + "MetricExpr": "100 * (STALL_FRONTEND_FLOW / STALL_FRONTEND_CPUBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend as the decode unit is awaiting input from the branch 
prediction unit.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_core_flush_bound", + "MetricExpr": "100 * (STALL_FRONTEND_FLUSH / STALL_FRONTEND_CPUBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend as the processor is recovering from a pipeline flush caused by bad speculation or other machine resteers.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_mem_bound", + "MetricExpr": "100 * (STALL_FRONTEND_MEMBOUND / STALL_FRONTEND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to frontend Core resource constraints related to the instruction fetch latency issues caused by memory access components.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_mem_cache_bound", + "MetricExpr": "100 * ((STALL_FRONTEND_L1I + STALL_FRONTEND_MEM) / STALL_FRONTEND_MEMBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to instruction fetch latency issues caused by I-cache misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_mem_tlb_bound", + "MetricExpr": "100 * (STALL_FRONTEND_TLB / STALL_FRONTEND_MEMBOUND)", + "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to instruction fetch latency issues caused by Instruction TLB misses.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Topdown_Frontend" + }, + { + "MetricName": "frontend_stalled_cycles", + "MetricExpr": "100 * (STALL_FRONTEND / CPU_CYCLES)", + "BriefDescription": "This metric is the percentage of cycles that were stalled due to resource constraints in the frontend unit of the processor.", + "ScaleUnit": "1percent of cycles", + "MetricGroup": "Cycle_Accounting" + }, + { + 
"MetricName": "instruction_fetch_average_latency", + "MetricExpr": "INST_FETCH_PERCYC / INST_FETCH", + "BriefDescription": "This metric measures the average latency of instruction fetches in CPU cycles.", + "ScaleUnit": "1CPU cycles", + "MetricGroup": "Average_Latency" + }, + { + "MetricName": "integer_dp_percentage", + "MetricExpr": "100 * (DP_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures scalar integer operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "ipc", + "MetricExpr": "INST_RETIRED / CPU_CYCLES", + "BriefDescription": "This metric measures the number of instructions retired per cycle.", + "ScaleUnit": "1per cycle", + "MetricGroup": "General" + }, + { + "MetricName": "itlb_mpki", + "MetricExpr": "1000 * (ITLB_WALK / INST_RETIRED)", + "BriefDescription": "This metric measures the number of instruction TLB Walks per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;ITLB_Effectiveness" + }, + { + "MetricName": "itlb_walk_average_latency", + "MetricExpr": "ITLB_WALK_PERCYC / ITLB_WALK", + "BriefDescription": "This metric measures the average latency of instruction TLB walks in CPU cycles.", + "ScaleUnit": "1CPU cycles", + "MetricGroup": "Average_Latency" + }, + { + "MetricName": "itlb_walk_ratio", + "MetricExpr": "ITLB_WALK / L1I_TLB", + "BriefDescription": "This metric measures the ratio of instruction TLB Walks to the total number of Instruction TLB accesses. This gives an indication of the effectiveness of the Instruction TLB accesses.", + "ScaleUnit": "1per TLB access", + "MetricGroup": "Miss_Ratio;ITLB_Effectiveness" + }, + { + "MetricName": "l1d_cache_miss_ratio", + "MetricExpr": "L1D_CACHE_REFILL / L1D_CACHE", + "BriefDescription": "This metric measures the ratio of L1 D-cache accesses missed to the total number of L1 D-cache accesses. 
This gives an indication of the effectiveness of the L1 D-cache.", + "ScaleUnit": "1per cache access", + "MetricGroup": "Miss_Ratio;L1D_Cache_Effectiveness" + }, + { + "MetricName": "l1d_cache_mpki", + "MetricExpr": "1000 * (L1D_CACHE_REFILL / INST_RETIRED)", + "BriefDescription": "This metric measures the number of L1 D-cache accesses missed per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;L1D_Cache_Effectiveness" + }, + { + "MetricName": "l1d_cache_rw_miss_ratio", + "MetricExpr": "l1d_demand_misses / l1d_demand_accesses", + "BriefDescription": "This metric measures the ratio of L1 D-cache Read accesses missed to the total number of L1 D-cache accesses. This gives an indication of the effectiveness of the L1 D-cache for demand Load or Store traffic.", + "ScaleUnit": "1per cache access", + "MetricGroup": "L1I_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1d_demand_accesses", + "MetricExpr": "L1D_CACHE_RW", + "BriefDescription": "This metric measures the count of L1 D-cache accesses incurred on Load or Store by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L1I_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1d_demand_misses", + "MetricExpr": "L1D_CACHE_REFILL_RW", + "BriefDescription": "This metric measures the count of L1 D-cache misses incurred on a Load or Store by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L1I_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1d_prf_accuracy", + "MetricExpr": "100 * (l1d_useful_prf / l1d_refilled_prf)", + "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.", + "ScaleUnit": "1percent of prefetch", + "MetricGroup": "L1I_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1d_prf_coverage", + "MetricExpr": "100 * (l1d_useful_prf / (l1d_demand_misses + l1d_refilled_prf))", + "BriefDescription": "This metric measures the baseline 
demand cache misses which the prefetcher brings into the cache.", + "ScaleUnit": "1percent of cache access", + "MetricGroup": "L1I_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1d_refilled_prf", + "MetricExpr": "L1D_CACHE_REFILL_HWPRF + L1D_CACHE_REFILL_PRFM + L1D_LFB_HIT_RW_FHWPRF + L1D_LFB_HIT_RW_FPRFM", + "BriefDescription": "This metric measures the count of cache lines refilled by L1 data prefetcher (hardware prefetches or software preload) into L1 D-cache.", + "ScaleUnit": "1count", + "MetricGroup": "L1I_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1d_tlb_miss_ratio", + "MetricExpr": "L1D_TLB_REFILL / L1D_TLB", + "BriefDescription": "This metric measures the ratio of L1 Data TLB accesses missed to the total number of L1 Data TLB accesses. This gives an indication of the effectiveness of the L1 Data TLB.", + "ScaleUnit": "1per TLB access", + "MetricGroup": "Miss_Ratio;DTLB_Effectiveness" + }, + { + "MetricName": "l1d_tlb_mpki", + "MetricExpr": "1000 * (L1D_TLB_REFILL / INST_RETIRED)", + "BriefDescription": "This metric measures the number of L1 Data TLB accesses missed per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;DTLB_Effectiveness" + }, + { + "MetricName": "l1d_useful_prf", + "MetricExpr": "L1D_CACHE_HIT_RW_FPRF + L1D_LFB_HIT_RW_FHWPRF + L1D_LFB_HIT_RW_FPRFM", + "BriefDescription": "This metric measures the count of cache lines refilled by L1 data prefetcher (hardware prefetches or software preload) into L1 D-cache which are further used by Load or Store from the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L1I_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1i_cache_miss_ratio", + "MetricExpr": "L1I_CACHE_REFILL / L1I_CACHE", + "BriefDescription": "This metric measures the ratio of L1 I-cache accesses missed to the total number of L1 I-cache accesses. 
This gives an indication of the effectiveness of the L1 I-cache.", + "ScaleUnit": "1per cache access", + "MetricGroup": "Miss_Ratio;L1I_Cache_Effectiveness" + }, + { + "MetricName": "l1i_cache_mpki", + "MetricExpr": "1000 * (L1I_CACHE_REFILL / INST_RETIRED)", + "BriefDescription": "This metric measures the number of L1 I-cache accesses missed per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;L1I_Cache_Effectiveness" + }, + { + "MetricName": "l1i_cache_rd_miss_ratio", + "MetricExpr": "l1i_demand_misses / l1i_demand_accesses", + "BriefDescription": "This metric measures the ratio of L1 I-cache Read accesses missed to the total number of L1 I-cache accesses. This gives an indication of the effectiveness of the L1 I-cache for demand instruction fetch traffic. Note that cache accesses in this cache are demand instruction fetch.", + "ScaleUnit": "1per cache access", + "MetricGroup": "L1D_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1i_demand_accesses", + "MetricExpr": "L1I_CACHE_RD", + "BriefDescription": "This metric measures the count of L1 I-cache accesses caused by an instruction fetch by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L1D_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1i_demand_misses", + "MetricExpr": "L1I_CACHE_REFILL_RD", + "BriefDescription": "This metric measures the count of L1 I-cache misses caused by an instruction fetch by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L1D_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1i_prf_accuracy", + "MetricExpr": "100 * (l1i_useful_prf / l1i_refilled_prf)", + "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.", + "ScaleUnit": "1percent of prefetch", + "MetricGroup": "L1D_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1i_prf_coverage", + "MetricExpr": "100 * (l1i_useful_prf / (l1i_demand_misses + 
l1i_refilled_prf))", + "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.", + "ScaleUnit": "1percent of cache access", + "MetricGroup": "L1D_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1i_refilled_prf", + "MetricExpr": "L1I_CACHE_REFILL_HWPRF + L1I_CACHE_REFILL_PRFM", + "BriefDescription": "This metric measures the count of cache lines refilled by L1 instruction prefetcher (hardware prefetches or software preload) into L1 I-cache.", + "ScaleUnit": "1count", + "MetricGroup": "L1D_Prefetcher_Effectiveness" + }, + { + "MetricName": "l1i_tlb_miss_ratio", + "MetricExpr": "L1I_TLB_REFILL / L1I_TLB", + "BriefDescription": "This metric measures the ratio of L1 Instruction TLB accesses missed to the total number of L1 Instruction TLB accesses. This gives an indication of the effectiveness of the L1 Instruction TLB.", + "ScaleUnit": "1per TLB access", + "MetricGroup": "Miss_Ratio;ITLB_Effectiveness" + }, + { + "MetricName": "l1i_tlb_mpki", + "MetricExpr": "1000 * (L1I_TLB_REFILL / INST_RETIRED)", + "BriefDescription": "This metric measures the number of L1 Instruction TLB accesses missed per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;ITLB_Effectiveness" + }, + { + "MetricName": "l1i_useful_prf", + "MetricExpr": "L1I_CACHE_HIT_RD_FPRF", + "BriefDescription": "This metric measures the count of cache lines refilled by L1 instruction prefetcher (hardware prefetches or software preload) into L1 I-cache which are further used by instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L1D_Prefetcher_Effectiveness" + }, + { + "MetricName": "l2_cache_miss_ratio", + "MetricExpr": "L2D_CACHE_REFILL / L2D_CACHE", + "BriefDescription": "This metric measures the ratio of L2 cache accesses missed to the total number of L2 cache accesses. 
This gives an indication of the effectiveness of the L2 cache, which is a unified cache that stores both data and instruction.\nNote that cache accesses in this cache are either data memory access or instruction fetch as this is a unified cache.", + "ScaleUnit": "1per cache access", + "MetricGroup": "Miss_Ratio;L2_Cache_Effectiveness" + }, + { + "MetricName": "l2_cache_mpki", + "MetricExpr": "1000 * (l2d_demand_misses / INST_RETIRED)", + "BriefDescription": "This metric measures the number of L2 unified cache accesses missed per thousand instructions executed.\nNote that cache accesses in this cache are either data memory access or instruction fetch as this is a unified cache.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;L2_Cache_Effectiveness" + }, + { + "MetricName": "l2_tlb_miss_ratio", + "MetricExpr": "L2D_TLB_REFILL / L2D_TLB", + "BriefDescription": "This metric measures the ratio of L2 unified TLB accesses missed to the total number of L2 unified TLB accesses.\nThis gives an indication of the effectiveness of the L2 TLB.", + "ScaleUnit": "1per TLB access", + "MetricGroup": "Miss_Ratio;ITLB_Effectiveness;DTLB_Effectiveness" + }, + { + "MetricName": "l2_tlb_mpki", + "MetricExpr": "1000 * (L2D_TLB_REFILL / INST_RETIRED)", + "BriefDescription": "This metric measures the number of L2 unified TLB accesses missed per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;ITLB_Effectiveness;DTLB_Effectiveness" + }, + { + "MetricName": "l2d_cache_rwl1prf_miss_ratio", + "MetricExpr": "l2d_demand_misses / l2d_demand_accesses", + "BriefDescription": "This metric measures the ratio of L2 D-cache Read accesses missed to the total number of L2 D-cache accesses.\nThis gives an indication of the effectiveness of the L2 D-cache for demand instruction fetch, Load, Store, or L1 prefetcher accesses traffic.", + "ScaleUnit": "1per cache access", + "MetricGroup": "L2_Prefetcher_Effectiveness" + }, + { + "MetricName": "l2d_demand_accesses", + 
"MetricExpr": "L2D_CACHE_RD + L2D_CACHE_WR + L2D_CACHE_L1PRF", + "BriefDescription": "This metric measures the count of L2 D-cache accesses incurred on an instruction fetch, Load, Store, or L1 prefetcher accesses by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L2_Prefetcher_Effectiveness" + }, + { + "MetricName": "l2d_demand_misses", + "MetricExpr": "L2D_CACHE_REFILL_RD + L2D_CACHE_REFILL_WR + L2D_CACHE_REFILL_L1PRF", + "BriefDescription": "This metric measures the count of L2 D-cache misses incurred on an instruction fetch, Load, Store, or L1 prefetcher accesses by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L2_Prefetcher_Effectiveness" + }, + { + "MetricName": "l2d_prf_accuracy", + "MetricExpr": "100 * (l2d_useful_prf / l2d_refilled_prf)", + "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.", + "ScaleUnit": "1percent of prefetch", + "MetricGroup": "L2_Prefetcher_Effectiveness" + }, + { + "MetricName": "l2d_prf_coverage", + "MetricExpr": "100 * (l2d_useful_prf / (l2d_demand_misses + l2d_refilled_prf))", + "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.", + "ScaleUnit": "1percent of cache access", + "MetricGroup": "L2_Prefetcher_Effectiveness" + }, + { + "MetricName": "l2d_refilled_prf", + "MetricExpr": "(L2D_CACHE_REFILL_PRF - L2D_CACHE_REFILL_L1PRF) + L2D_LFB_HIT_RWL1PRF_FHWPRF", + "BriefDescription": "This metric measures the count of cache lines refilled by L2 data prefetcher (hardware prefetches or software preload) into L2 D-cache.", + "ScaleUnit": "1count", + "MetricGroup": "L2_Prefetcher_Effectiveness" + }, + { + "MetricName": "l2d_useful_prf", + "MetricExpr": "L2D_CACHE_HIT_RWL1PRF_FPRF + L2D_LFB_HIT_RWL1PRF_FHWPRF", + "BriefDescription": "This metric measures the count of cache lines refilled by L2 data prefetcher (hardware 
prefetches or software preload) into L2 D-cache which are further used by instruction fetch, Load, Store, or L1 prefetcher accesses from the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L2_Prefetcher_Effectiveness" + }, + { + "MetricName": "l3d_cache_rwl1prfl2prf_miss_ratio", + "MetricExpr": "l3d_demand_misses / l3d_demand_accesses", + "BriefDescription": "This metric measures the ratio of L3 D-cache Read accesses missed to the total number of L3 D-cache accesses. This gives an indication of the effectiveness of the L3 D-cache for demand instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses traffic.", + "ScaleUnit": "1per cache access", + "MetricGroup": "L3_Prefetcher_Effectiveness" + }, + { + "MetricName": "l3d_demand_accesses", + "MetricExpr": "L3D_CACHE_RWL1PRFL2PRF", + "BriefDescription": "This metric measures the count of L3 D-cache accesses incurred on an instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L3_Prefetcher_Effectiveness" + }, + { + "MetricName": "l3d_demand_misses", + "MetricExpr": "L3D_CACHE_REFILL_RWL1PRFL2PRF", + "BriefDescription": "This metric measures the count of L3 D-cache misses incurred on an instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses by the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L3_Prefetcher_Effectiveness" + }, + { + "MetricName": "l3d_prf_accuracy", + "MetricExpr": "100 * (l3d_useful_prf / l3d_refilled_prf)", + "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.", + "ScaleUnit": "1percent of prefetch", + "MetricGroup": "L3_Prefetcher_Effectiveness" + }, + { + "MetricName": "l3d_prf_coverage", + "MetricExpr": "100 * (l3d_useful_prf / (l3d_demand_misses + l3d_refilled_prf))", + "BriefDescription": "This metric measures the baseline 
demand cache misses which the prefetcher brings into the cache.", + "ScaleUnit": "1percent of cache access", + "MetricGroup": "L3_Prefetcher_Effectiveness" + }, + { + "MetricName": "l3d_refilled_prf", + "MetricExpr": "L3D_CACHE_REFILL_HWPRF + L3D_CACHE_REFILL_PRFM - L3D_CACHE_REFILL_L1PRF - L3D_CACHE_REFILL_L2PRF", + "BriefDescription": "This metric measures the count of cache lines refilled by L3 data prefetcher (hardware prefetches or software preload) into L3 D-cache.", + "ScaleUnit": "1count", + "MetricGroup": "L3_Prefetcher_Effectiveness" + }, + { + "MetricName": "l3d_useful_prf", + "MetricExpr": "L3D_CACHE_HIT_RWL1PRFL2PRF_FPRF", + "BriefDescription": "This metric measures the count of cache lines refilled by L3 data prefetcher (hardware prefetches or software preload) into L3 D-cache which are further used by instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses from the instruction stream of the program.", + "ScaleUnit": "1count", + "MetricGroup": "L3_Prefetcher_Effectiveness" + }, + { + "MetricName": "ll_cache_read_hit_ratio", + "MetricExpr": "(LL_CACHE_RD - LL_CACHE_MISS_RD) / LL_CACHE_RD", + "BriefDescription": "This metric measures the ratio of last level cache Read accesses hit in the cache to the total number of last level cache accesses. This gives an indication of the effectiveness of the last level cache for Read traffic. Note that cache accesses in this cache are either data memory access or instruction fetch as this is a system level cache.", + "ScaleUnit": "1per cache access", + "MetricGroup": "LL_Cache_Effectiveness" + }, + { + "MetricName": "ll_cache_read_miss_ratio", + "MetricExpr": "LL_CACHE_MISS_RD / LL_CACHE_RD", + "BriefDescription": "This metric measures the ratio of last level cache Read accesses missed to the total number of last level cache accesses. This gives an indication of the effectiveness of the last level cache for Read traffic. 
Note that cache accesses in this cache are either data memory access or instruction fetch as this is a system level cache.", + "ScaleUnit": "1per cache access", + "MetricGroup": "Miss_Ratio;LL_Cache_Effectiveness" + }, + { + "MetricName": "ll_cache_read_mpki", + "MetricExpr": "1000 * (LL_CACHE_MISS_RD / INST_RETIRED)", + "BriefDescription": "This metric measures the number of last level cache Read accesses missed per thousand instructions executed.", + "ScaleUnit": "1MPKI", + "MetricGroup": "MPKI;LL_Cache_Effectiveness" + }, + { + "MetricName": "load_average_latency", + "MetricExpr": "MEM_ACCESS_RD_PERCYC / MEM_ACCESS", + "BriefDescription": "This metric measures the average latency of Load operations in CPU cycles.", + "ScaleUnit": "1CPU cycles", + "MetricGroup": "Average_Latency" + }, + { + "MetricName": "load_percentage", + "MetricExpr": "100 * (LD_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures Load operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "nonsve_fp_ops_per_cycle", + "MetricExpr": "FP_FIXED_OPS_SPEC / CPU_CYCLES", + "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by an instruction that is not an SVE instruction. 
Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.", + "ScaleUnit": "1operations per cycle", + "MetricGroup": "FP_Arithmetic_Intensity" + }, + { + "MetricName": "retiring", + "MetricExpr": "100 * ((OP_RETIRED/OP_SPEC) * (1 - (STALL_SLOT/CPU_SLOT)))", + "BriefDescription": "This metric is the percentage of total slots that retired operations, which indicates cycles that were utilized efficiently.", + "ScaleUnit": "1percent of slots", + "MetricGroup": "TopdownL1" + }, + { + "MetricName": "scalar_fp_percentage", + "MetricExpr": "100 * (VFP_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures scalar floating point operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "simd_percentage", + "MetricExpr": "100 * (ASE_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures advanced SIMD operations as a percentage of total operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "store_percentage", + "MetricExpr": "100 * (ST_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures Store operations as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "sve_all_percentage", + "MetricExpr": "100 * (SVE_INST_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures scalable vector operations, including Loads and Stores, as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "Operation_Mix" + }, + { + "MetricName": "sve_fp_ops_per_cycle", + "MetricExpr": "FP_SCALE_OPS_SPEC / CPU_CYCLES", + "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by SVE instructions. 
Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.", + "ScaleUnit": "1operations per cycle", + "MetricGroup": "FP_Arithmetic_Intensity" + }, + { + "MetricName": "sve_predicate_empty_percentage", + "MetricExpr": "100 * (SVE_PRED_EMPTY_SPEC / SVE_PRED_SPEC)", + "BriefDescription": "This metric measures scalable vector operations with no active predicates as a percentage of SVE predicated operations speculatively executed.", + "ScaleUnit": "1percent of SVE predicated operations", + "MetricGroup": "SVE_Effectiveness" + }, + { + "MetricName": "sve_predicate_full_percentage", + "MetricExpr": "100 * (SVE_PRED_FULL_SPEC / SVE_PRED_SPEC)", + "BriefDescription": "This metric measures scalable vector operations with all active predicates as a percentage of SVE predicated operations speculatively executed.", + "ScaleUnit": "1percent of SVE predicated operations", + "MetricGroup": "SVE_Effectiveness" + }, + { + "MetricName": "sve_predicate_partial_percentage", + "MetricExpr": "100 * (SVE_PRED_PARTIAL_SPEC / SVE_PRED_SPEC)", + "BriefDescription": "This metric measures scalable vector operations with at least one active predicate as a percentage of SVE predicated operations speculatively executed.", + "ScaleUnit": "1percent of SVE predicated operations", + "MetricGroup": "SVE_Effectiveness" + }, + { + "MetricName": "sve_predicate_percentage", + "MetricExpr": "100 * (SVE_PRED_SPEC / INST_SPEC)", + "BriefDescription": "This metric measures scalable vector operations with predicates as a percentage of operations speculatively executed.", + "ScaleUnit": "1percent of operations", + "MetricGroup": "SVE_Effectiveness" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json new file mode 100644 index 0000000000000..8ff87d844e521 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json @@ -0,0 +1,642 @@ 
+[ + { + "ArchStdEvent": "SW_INCR", + "PublicDescription": "This event counts software writes to the PMSWINC_EL0 (software PMU increment) register. The PMSWINC_EL0 register is a manually updated counter for use by application software.\nThis event could be used to measure any user program event, such as accesses to a particular data structure (by writing to the PMSWINC_EL0 register each time the data structure is accessed).\nTo use the PMSWINC_EL0 register and event, developers must insert instructions that write to the PMSWINC_EL0 register into the source code.\nSince the SW_INCR event records writes to the PMSWINC_EL0 register, there is no need to do a Read/Increment/Write sequence to the PMSWINC_EL0 register." + }, + { + "ArchStdEvent": "TRB_WRAP", + "PublicDescription": "This event is generated each time the trace buffer current Write pointer is wrapped to the trace buffer base pointer." + }, + { + "ArchStdEvent": "TRCEXTOUT0", + "PublicDescription": "Trace unit external output 0." + }, + { + "ArchStdEvent": "TRCEXTOUT1", + "PublicDescription": "Trace unit external output 1." + }, + { + "ArchStdEvent": "TRCEXTOUT2", + "PublicDescription": "Trace unit external output 2." + }, + { + "ArchStdEvent": "TRCEXTOUT3", + "PublicDescription": "Trace unit external output 3." + }, + { + "ArchStdEvent": "CTI_TRIGOUT4", + "PublicDescription": "Cross-trigger Interface output trigger 4." + }, + { + "ArchStdEvent": "CTI_TRIGOUT5", + "PublicDescription": "Cross-trigger Interface output trigger 5." + }, + { + "ArchStdEvent": "CTI_TRIGOUT6", + "PublicDescription": "Cross-trigger Interface output trigger 6." + }, + { + "ArchStdEvent": "CTI_TRIGOUT7", + "PublicDescription": "Cross-trigger Interface output trigger 7." + }, + { + "EventCode": "0x00e1", + "EventName": "L1I_PRFM_REQ_DROP", + "PublicDescription": "L1 I-cache software prefetch dropped." + }, + { + "EventCode": "0x0100", + "EventName": "L1_PF_REFILL", + "PublicDescription": "L1 prefetch requests, refilled to L1 cache." 
+ }, + { + "EventCode": "0x0120", + "EventName": "FLUSH", + "PublicDescription": "This event counts both the CT flush and BX flush. The BR_MIS_PRED counts the BX flushes. So the FLUSH-BR_MIS_PRED gives the CT flushes." + }, + { + "EventCode": "0x0121", + "EventName": "FLUSH_MEM", + "PublicDescription": "Flushes due to memory hazards. This only includes CT flushes." + }, + { + "EventCode": "0x0122", + "EventName": "FLUSH_BAD_BRANCH", + "PublicDescription": "Flushes due to bad predicted branch. This only includes CT flushes." + }, + { + "EventCode": "0x0123", + "EventName": "FLUSH_STDBYPASS", + "PublicDescription": "Flushes due to bad predecode. This only includes CT flushes." + }, + { + "EventCode": "0x0124", + "EventName": "FLUSH_ISB", + "PublicDescription": "Flushes due to ISB or similar side-effects. This only includes CT flushes." + }, + { + "EventCode": "0x0125", + "EventName": "FLUSH_OTHER", + "PublicDescription": "Flushes due to other hazards. This only includes CT flushes." + }, + { + "EventCode": "0x0126", + "EventName": "STORE_STREAM", + "PublicDescription": "Stored lines in streaming no-Write-allocate mode." + }, + { + "EventCode": "0x0127", + "EventName": "NUKE_RAR", + "PublicDescription": "Load/Store nuke due to Read-after-Read ordering hazard." + }, + { + "EventCode": "0x0128", + "EventName": "NUKE_RAW", + "PublicDescription": "Load/Store nuke due to Read-after-Write ordering hazard." + }, + { + "EventCode": "0x0129", + "EventName": "L1_PF_GEN_PAGE", + "PublicDescription": "Load/Store prefetch to L1 generated, Page mode." + }, + { + "EventCode": "0x012a", + "EventName": "L1_PF_GEN_STRIDE", + "PublicDescription": "Load/Store prefetch to L1 generated, stride mode." + }, + { + "EventCode": "0x012b", + "EventName": "L2_PF_GEN_LD", + "PublicDescription": "Load prefetch to L2 generated." + }, + { + "EventCode": "0x012d", + "EventName": "LS_PF_TRAIN_TABLE_ALLOC", + "PublicDescription": "LS prefetch train table entry allocated." 
+ }, + { + "EventCode": "0x0130", + "EventName": "LS_PF_GEN_TABLE_ALLOC", + "PublicDescription": "This event counts the number of cycles with at least one table allocation, for L2 hardware prefetches (including the software PRFM instructions that are converted into hardware prefetches due to D-TLB miss).\nLS prefetch gen table allocation (for L2 prefetches)." + }, + { + "EventCode": "0x0131", + "EventName": "LS_PF_GEN_TABLE_ALLOC_PF_PEND", + "PublicDescription": "This event counts the number of cycles in which at least one hardware prefetch is dropped due to the inability to identify a victim when the generation table is full. The hardware prefetch considered here includes the software PRFM that is converted into hardware prefetches due to D-TLB miss." + }, + { + "EventCode": "0x0132", + "EventName": "TBW", + "PublicDescription": "Tablewalks." + }, + { + "EventCode": "0x0134", + "EventName": "S1L2_HIT", + "PublicDescription": "Translation cache hit on S1L2 walk cache entry." + }, + { + "EventCode": "0x0135", + "EventName": "S1L1_HIT", + "PublicDescription": "Translation cache hit on S1L1 walk cache entry." + }, + { + "EventCode": "0x0136", + "EventName": "S1L0_HIT", + "PublicDescription": "Translation cache hit on S1L0 walk cache entry." + }, + { + "EventCode": "0x0137", + "EventName": "S2L2_HIT", + "PublicDescription": "Translation cache hit for S2L2 IPA walk cache entry." + }, + { + "EventCode": "0x0138", + "EventName": "IPA_REQ", + "PublicDescription": "Translation cache lookups for IPA to PA entries." + }, + { + "EventCode": "0x0139", + "EventName": "IPA_REFILL", + "PublicDescription": "Translation cache refills for IPA to PA entries." + }, + { + "EventCode": "0x013a", + "EventName": "S1_FLT", + "PublicDescription": "Stage1 tablewalk fault." + }, + { + "EventCode": "0x013b", + "EventName": "S2_FLT", + "PublicDescription": "Stage2 tablewalk fault." + }, + { + "EventCode": "0x013c", + "EventName": "COLT_REFILL", + "PublicDescription": "Aggregated page refill." 
+ }, + { + "EventCode": "0x0145", + "EventName": "L1_PF_HIT", + "PublicDescription": "L1 prefetch requests, hitting in L1 cache." + }, + { + "EventCode": "0x0146", + "EventName": "L1_PF", + "PublicDescription": "L1 prefetch requests." + }, + { + "EventCode": "0x0147", + "EventName": "CACHE_LS_REFILL", + "PublicDescription": "L2 D-cache refill, Load/Store." + }, + { + "EventCode": "0x0148", + "EventName": "CACHE_PF", + "PublicDescription": "L2 prefetch requests." + }, + { + "EventCode": "0x0149", + "EventName": "CACHE_PF_HIT", + "PublicDescription": "L2 prefetch requests, hitting in L2 cache." + }, + { + "EventCode": "0x0150", + "EventName": "UNUSED_PF", + "PublicDescription": "L2 unused prefetch." + }, + { + "EventCode": "0x0151", + "EventName": "PFT_SENT", + "PublicDescription": "L2 prefetch TGT sent.\nNote that PFT_SENT != PFT_USEFUL + PFT_DROP. There may be PFT_SENT for which the accesses resulted in a SLC hit." + }, + { + "EventCode": "0x0152", + "EventName": "PFT_USEFUL", + "PublicDescription": "L2 prefetch TGT useful." + }, + { + "EventCode": "0x0153", + "EventName": "PFT_DROP", + "PublicDescription": "L2 prefetch TGT dropped." + }, + { + "EventCode": "0x0162", + "EventName": "LRQ_FULL", + "PublicDescription": "This event counts the number of cycles the LRQ is full." + }, + { + "EventCode": "0x0163", + "EventName": "FETCH_FQ_EMPTY", + "PublicDescription": "Fetch Queue empty cycles." + }, + { + "EventCode": "0x0164", + "EventName": "FPG2", + "PublicDescription": "Forward progress guarantee. Medium range livelock triggered." + }, + { + "EventCode": "0x0165", + "EventName": "FPG", + "PublicDescription": "Forward progress guarantee. Tofu global livelock buster is triggered." + }, + { + "EventCode": "0x0172", + "EventName": "DEADBLOCK", + "PublicDescription": "Write-back evictions converted to dataless EVICT.\nThe victim line is deemed deadblock if the likeliness of a reuse is low. 
The Core uses dataless evict to evict a deadblock; and it uses an evict with data to evict an L2 line that is not a deadblock." + }, + { + "EventCode": "0x0173", + "EventName": "PF_PRQ_ALLOC_PF_PEND", + "PublicDescription": "L1 prefetch prq allocation (replacing pending)." + }, + { + "EventCode": "0x0178", + "EventName": "FETCH_ICACHE_INSTR", + "PublicDescription": "Instructions fetched from I-cache." + }, + { + "EventCode": "0x017b", + "EventName": "NEAR_CAS", + "PublicDescription": "Near atomics: compare and swap." + }, + { + "EventCode": "0x017c", + "EventName": "NEAR_CAS_PASS", + "PublicDescription": "Near atomics: compare and swap pass." + }, + { + "EventCode": "0x017d", + "EventName": "FAR_CAS", + "PublicDescription": "Far atomics: compare and swap." + }, + { + "EventCode": "0x0186", + "EventName": "L2_BTB_RELOAD_MAIN_BTB", + "PublicDescription": "Number of completed L1 BTB update initiated by L2 BTB hit which swap branch information between L1 BTB and L2 BTB." + }, + { + "EventCode": "0x018f", + "EventName": "L1_PF_GEN_MCMC", + "PublicDescription": "Load/Store prefetch to L1 generated, MCMC." + }, + { + "EventCode": "0x0190", + "EventName": "PF_MODE_0_CYCLES", + "PublicDescription": "Number of cycles in which the hardware prefetcher is in the most aggressive mode." + }, + { + "EventCode": "0x0191", + "EventName": "PF_MODE_1_CYCLES", + "PublicDescription": "Number of cycles in which the hardware prefetcher is in the more aggressive mode." + }, + { + "EventCode": "0x0192", + "EventName": "PF_MODE_2_CYCLES", + "PublicDescription": "Number of cycles in which the hardware prefetcher is in the less aggressive mode." + }, + { + "EventCode": "0x0193", + "EventName": "PF_MODE_3_CYCLES", + "PublicDescription": "Number of cycles in which the hardware prefetcher is in the most conservative mode." + }, + { + "EventCode": "0x0194", + "EventName": "TXREQ_LIMIT_MAX_CYCLES", + "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is the L2_TQ_SIZE." 
+ }, + { + "EventCode": "0x0195", + "EventName": "TXREQ_LIMIT_3QUARTER_CYCLES", + "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 3/4 of the L2_TQ_SIZE and the L2_TQ_SIZE-1." + }, + { + "EventCode": "0x0196", + "EventName": "TXREQ_LIMIT_HALF_CYCLES", + "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 1/2 of the L2_TQ_SIZE and 3/4 of the L2_TQ_SIZE." + }, + { + "EventCode": "0x0197", + "EventName": "TXREQ_LIMIT_1QUARTER_CYCLES", + "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 1/4 of the L2_TQ_SIZE and 1/2 of the L2_TQ_SIZE." + }, + { + "EventCode": "0x019d", + "EventName": "PREFETCH_LATE_CMC", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by CMC prefetch request." + }, + { + "EventCode": "0x019e", + "EventName": "PREFETCH_LATE_BO", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by BO prefetch request." + }, + { + "EventCode": "0x019f", + "EventName": "PREFETCH_LATE_STRIDE", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by STRIDE prefetch request." + }, + { + "EventCode": "0x01a0", + "EventName": "PREFETCH_LATE_SPATIAL", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by SPATIAL prefetch request." + }, + { + "EventCode": "0x01a2", + "EventName": "PREFETCH_LATE_TBW", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by TBW prefetch request." + }, + { + "EventCode": "0x01a3", + "EventName": "PREFETCH_LATE_PAGE", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by PAGE prefetch request." + }, + { + "EventCode": "0x01a4", + "EventName": "PREFETCH_LATE_GSMS", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by GSMS prefetch request." 
+ }, + { + "EventCode": "0x01a5", + "EventName": "PREFETCH_LATE_SIP_CONS", + "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by SIP_CONS prefetch request." + }, + { + "EventCode": "0x01a6", + "EventName": "PREFETCH_REFILL_CMC", + "PublicDescription": "PF/prefetch or PF/readclean request from CMC pf engine filled the L2 cache." + }, + { + "EventCode": "0x01a7", + "EventName": "PREFETCH_REFILL_BO", + "PublicDescription": "PF/prefetch or PF/readclean request from BO pf engine filled the L2 cache." + }, + { + "EventCode": "0x01a8", + "EventName": "PREFETCH_REFILL_STRIDE", + "PublicDescription": "PF/prefetch or PF/readclean request from STRIDE pf engine filled the L2 cache." + }, + { + "EventCode": "0x01a9", + "EventName": "PREFETCH_REFILL_SPATIAL", + "PublicDescription": "PF/prefetch or PF/readclean request from SPATIAL pf engine filled the L2 cache." + }, + { + "EventCode": "0x01ab", + "EventName": "PREFETCH_REFILL_TBW", + "PublicDescription": "PF/prefetch or PF/readclean request from TBW pf engine filled the L2 cache." + }, + { + "EventCode": "0x01ac", + "EventName": "PREFETCH_REFILL_PAGE", + "PublicDescription": "PF/prefetch or PF/readclean request from PAGE pf engine filled the L2 cache." + }, + { + "EventCode": "0x01ad", + "EventName": "PREFETCH_REFILL_GSMS", + "PublicDescription": "PF/prefetch or PF/readclean request from GSMS pf engine filled the L2 cache." + }, + { + "EventCode": "0x01ae", + "EventName": "PREFETCH_REFILL_SIP_CONS", + "PublicDescription": "PF/prefetch or PF/readclean request from SIP_CONS pf engine filled the L2 cache." + }, + { + "EventCode": "0x01af", + "EventName": "CACHE_HIT_LINE_PF_CMC", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by CMC prefetch request." + }, + { + "EventCode": "0x01b0", + "EventName": "CACHE_HIT_LINE_PF_BO", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by BO prefetch request." 
+ }, + { + "EventCode": "0x01b1", + "EventName": "CACHE_HIT_LINE_PF_STRIDE", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by STRIDE prefetch request." + }, + { + "EventCode": "0x01b2", + "EventName": "CACHE_HIT_LINE_PF_SPATIAL", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by SPATIAL prefetch request." + }, + { + "EventCode": "0x01b4", + "EventName": "CACHE_HIT_LINE_PF_TBW", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by TBW prefetch request." + }, + { + "EventCode": "0x01b5", + "EventName": "CACHE_HIT_LINE_PF_PAGE", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by PAGE prefetch request." + }, + { + "EventCode": "0x01b6", + "EventName": "CACHE_HIT_LINE_PF_GSMS", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by GSMS prefetch request." + }, + { + "EventCode": "0x01b7", + "EventName": "CACHE_HIT_LINE_PF_SIP_CONS", + "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by SIP_CONS prefetch request." + }, + { + "EventCode": "0x01ba", + "EventName": "PREFETCH_LATE_STORE_ISSUE", + "PublicDescription": "This event counts the number of demand requests that matches a Store-issue prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." + }, + { + "EventCode": "0x01bb", + "EventName": "PREFETCH_LATE_STORE_STRIDE", + "PublicDescription": "This event counts the number of demand requests that matches a Store-stride prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." 
+ }, + { + "EventCode": "0x01bc", + "EventName": "PREFETCH_LATE_PC_OFFSET", + "PublicDescription": "This event counts the number of demand requests that matches a PC-offset prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." + }, + { + "EventCode": "0x01bd", + "EventName": "PREFETCH_LATE_IFUPF", + "PublicDescription": "This event counts the number of demand requests that matches a IFU prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements." + }, + { + "EventCode": "0x01be", + "EventName": "PREFETCH_REFILL_STORE_ISSUE", + "PublicDescription": "This event counts the number of cache refills due to Store-Issue prefetcher." + }, + { + "EventCode": "0x01bf", + "EventName": "PREFETCH_REFILL_STORE_STRIDE", + "PublicDescription": "This event counts the number of cache refills due to Store-stride prefetcher." + }, + { + "EventCode": "0x01c0", + "EventName": "PREFETCH_REFILL_PC_OFFSET", + "PublicDescription": "This event counts the number of cache refills due to PC-offset prefetcher." + }, + { + "EventCode": "0x01c1", + "EventName": "PREFETCH_REFILL_IFUPF", + "PublicDescription": "This event counts the number of cache refills due to IFU prefetcher." + }, + { + "EventCode": "0x01c2", + "EventName": "CACHE_HIT_LINE_PF_STORE_ISSUE", + "PublicDescription": "This event counts the number of first hit to a cache line filled by Store-issue prefetcher." + }, + { + "EventCode": "0x01c3", + "EventName": "CACHE_HIT_LINE_PF_STORE_STRIDE", + "PublicDescription": "This event counts the number of first hit to a cache line filled by Store-stride prefetcher." 
+ }, + { + "EventCode": "0x01c4", + "EventName": "CACHE_HIT_LINE_PF_PC_OFFSET", + "PublicDescription": "This event counts the number of first hit to a cache line filled by PC-offset prefetcher." + }, + { + "EventCode": "0x01c5", + "EventName": "CACHE_HIT_LINE_PF_IFUPF", + "PublicDescription": "This event counts the number of first hit to a cache line filled by IFU prefetcher." + }, + { + "EventCode": "0x01c6", + "EventName": "L2_PF_GEN_ST_ISSUE", + "PublicDescription": "Store-issue prefetch to L2 generated." + }, + { + "EventCode": "0x01c7", + "EventName": "L2_PF_GEN_ST_STRIDE", + "PublicDescription": "Store-stride prefetch to L2 generated" + }, + { + "EventCode": "0x01cb", + "EventName": "L2_TQ_OUTSTANDING", + "PublicDescription": "Outstanding tracker count, per cycle.\nThis event increments by the number of valid entries pertaining to this thread in the L2TQ, in each cycle.\nThis event can be used to calculate the occupancy of L2TQ by dividing this by the CPU_CYCLES event. The L2TQ queue tracks the outstanding Read, Write and Snoop transactions. The Read transaction and the Write transaction entries are attributable to PE, whereas the Snoop transactions are not always attributable to PE." + }, + { + "EventCode": "0x01cc", + "EventName": "TXREQ_LIMIT_COUNT_CYCLES", + "PublicDescription": "This event increments by the dynamic TXREQ value, in each cycle.\nThis is a companion event of TXREQ_LIMIT_MAX_CYCLES, TXREQ_LIMIT_3QUARTER_CYCLES, TXREQ_LIMIT_HALF_CYCLES, and TXREQ_LIMIT_1QUARTER_CYCLES." + }, + { + "EventCode": "0x01ce", + "EventName": "L3DPRFM_TO_L2PRQ_CONVERTED", + "PublicDescription": "This event counts the number of Converted-L3D-PRFMs. These are indeed L3D PRFM and activities around these PRFM are counted by the L3D_CACHE_PRFM, L3D_CACHE_REFILL_PRFM and L3D_CACHE_REFILL Events." 
+ }, + { + "EventCode": "0x01d2", + "EventName": "DVM_TLBI_RCVD", + "PublicDescription": "This event counts the number of TLBI DVM messages received over the CHI interface, for *this* Core." + }, + { + "EventCode": "0x01d6", + "EventName": "DSB_COMMITING_LOCAL_TLBI", + "PublicDescription": "This event counts the number of DSBs that are retired and have committed at least one local TLBI instruction. This event increments no more than once (in a cycle) even if the DSB commits multiple local TLBI instructions." + }, + { + "EventCode": "0x01d7", + "EventName": "DSB_COMMITING_BROADCAST_TLBI", + "PublicDescription": "This event counts the number of DSBs that are retired and have committed at least one broadcast TLBI instruction. This event increments no more than once (in a cycle) even if the DSB commits multiple broadcast TLBI instructions." + }, + { + "EventCode": "0x01eb", + "EventName": "L1DPRFM_L2DPRFM_TO_L2PRQ_CONVERTED", + "PublicDescription": "This event counts the number of Converted-L1D-PRFMs and Converted-L2D-PRFMs.\nActivities involving the Converted-L1D-PRFM are counted by the L1D_CACHE_PRFM. However they are *not* counted by the L1D_CACHE_REFILL_PRFM, and L1D_CACHE_REFILL, as these Converted-L1D-PRFM are treated as L2 D hardware prefetches. Activities around the Converted-L1D-PRFMs and Converted-L2D-PRFMs are counted by the L2D_CACHE_PRFM, L2D_CACHE_REFILL_PRFM and L2D_CACHE_REFILL Events." + }, + { + "EventCode": "0x01ec", + "EventName": "PREFETCH_LATE_CONVERTED_PRFM", + "PublicDescription": "This event counts the number of demand requests that match a Converted-L1D-PRFM or Converted-L2D-PRFM pending refill request at L2 D-cache. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements.\nNote that this event is not counted by the L2D_CACHE_HIT_RWL1PRF_LATE_HWPRF, though the Converted-L1D-PRFM or Converted-L2D-PRFM are replayed by the L2PRQ." 
+ }, + { + "EventCode": "0x01ed", + "EventName": "PREFETCH_REFILL_CONVERTED_PRFM", + "PublicDescription": "This event counts the number of L2 D-cache refills due to Converted-L1D-PRFM or Converted-L2D-PRFM.\nNote: L2D_CACHE_REFILL_PRFM is inclusive of PREFETCH_REFILL_CONVERTED_PRFM, where both the PREFETCH_REFILL_CONVERTED_PRFM and the L2D_CACHE_REFILL_PRFM increment when the L2 D-cache refills due to Converted-L1D-PRFM or Converted-L2D-PRFM." + }, + { + "EventCode": "0x01ee", + "EventName": "CACHE_HIT_LINE_PF_CONVERTED_PRFM", + "PublicDescription": "This event counts the number of first hits to a cache line filled by Converted-L1D-PRFM or Converted-L2D-PRFM.\nNote that L2D_CACHE_HIT_RWL1PRF_FPRFM is inclusive of CACHE_HIT_LINE_PF_CONVERTED_PRFM, where both the CACHE_HIT_LINE_PF_CONVERTED_PRFM and the L2D_CACHE_HIT_RWL1PRF_FPRFM increment on a first hit to L2 D-cache filled by Converted-L1D-PRFM or Converted-L2D-PRFM." + }, + { + "EventCode": "0x01f0", + "EventName": "TMS_ST_TO_SMT_LATENCY", + "PublicDescription": "This event counts the number of CPU cycles spent on TMS for ST-to-SMT switch.\nThis event is counted by both the threads - This event in both threads increments during TMS for ST-to-SMT switch." + }, + { + "EventCode": "0x01f1", + "EventName": "TMS_SMT_TO_ST_LATENCY", + "PublicDescription": "This event counts the number of CPU cycles spent on TMS for SMT-to-ST switch. The count also includes the CPU cycles spent due to an aborted SMT-to-ST TMS attempt.\nThis event is counted only by the thread that is not in WFI." + }, + { + "EventCode": "0x01f2", + "EventName": "TMS_ST_TO_SMT_COUNT", + "PublicDescription": "This event counts the number of completed TMS from ST-to-SMT.\nThis event is counted only by the active thread (the one that is not in WFI).\nNote: When an active thread enters the Debug state in ST-Full resource mode, it is switched to SMT mode. This is because the inactive thread cannot wake up while the other thread remains in the Debug state. 
To prevent this issue, threads operating in ST-Full resource mode are transitioned to SMT mode upon entering Debug state. This event count will also reflect such switches from ST to SMT mode.\n(Also see the NV_CPUACTLR14_EL1.chka_prevent_st_tx_to_smt_when_tx_in_debug_state bit to disable this behavior.)" + }, + { + "EventCode": "0x01f3", + "EventName": "TMS_SMT_TO_ST_COUNT", + "PublicDescription": "This event counts the number of completed TMS from SMT-to-ST.\nThis event is counted only by the thread that is not in WFI." + }, + { + "EventCode": "0x01f4", + "EventName": "TMS_SMT_TO_ST_COUNT_ABRT", + "PublicDescription": "This event counts the number of aborted TMS from SMT-to-ST.\nThis event is counted only by the thread that is not in WFI." + }, + { + "EventCode": "0x0202", + "EventName": "L0I_CACHE_RD", + "PublicDescription": "This event counts the number of predict blocks serviced out of L0 I-cache.\nNote: The L0 I-cache performs at most 4 L0 I look-ups in a cycle. Two of which are to service PB from L0 I. And the other two to refill L0 I-cache from L1 I. This event counts only the L0 I-cache lookups pertaining to servicing the PB from L0 I." + }, + { + "EventCode": "0x0203", + "EventName": "L0I_CACHE_REFILL", + "PublicDescription": "This event counts the number of L0I cache refills from L1 I-cache." + }, + { + "EventCode": "0x0207", + "EventName": "INTR_LATENCY", + "PublicDescription": "This event counts the number of cycles elapsed between when an Interrupt is recognized (after masking) to when a uop associated with the first instruction in the destination exception level is allocated. If there is some other flush condition that pre-empts the Interrupt, then the cycle count terminates early at the first instruction executed after that flush.
In the event of dropped Interrupts (when an Interrupt is deasserted before it is taken), this counter measures the number of cycles that elapse from the moment an Interrupt is recognized (post-masking) until the Interrupt is dropped or deasserted.\nNote that\n* IESB(Implicit Error Synchronization Barrier) is an internal mop, so the latency of an implicit IESB mop executed before the Interrupt taken is included in the Interrupt latency count.\n* Nukes or TMS sequence within the window are also counted by the Interrupt latency Event.\n* A SMT to ST TMS will be aborted on detecting the wake condition for the WFI thread. The Interrupt latency count includes any additional penalty for an aborted TMS." + }, + { + "EventCode": "0x021c", + "EventName": "CWT_ALLOC_ENTRY", + "PublicDescription": "Cache Way Tracker Allocate entry." + }, + { + "EventCode": "0x021d", + "EventName": "CWT_ALLOC_LINE", + "PublicDescription": "Cache Way Tracker Allocate line." + }, + { + "EventCode": "0x021e", + "EventName": "CWT_HIT", + "PublicDescription": "Cache Way Tracker hit." + }, + { + "EventCode": "0x021f", + "EventName": "CWT_HIT_TAG", + "PublicDescription": "Cache Way Tracker hit when ITAG lookup suppressed." + }, + { + "EventCode": "0x0220", + "EventName": "CWT_REPLAY_TAG", + "PublicDescription": "Cache Way Tracker causes ITAG replay due to miss when ITAG lookup suppressed." + }, + { + "EventCode": "0x0250", + "EventName": "GPT_REQ", + "PublicDescription": "GPT lookup." + }, + { + "EventCode": "0x0251", + "EventName": "GPT_WC_HIT", + "PublicDescription": "GPT lookup hit in Walk cache." + }, + { + "EventCode": "0x0252", + "EventName": "GPT_PG_HIT", + "PublicDescription": "GPT lookup hit in TLB." 
+ } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json new file mode 100644 index 0000000000000..34c7eefa66b05 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json @@ -0,0 +1,94 @@ +[ + { + "ArchStdEvent": "INST_RETIRED", + "PublicDescription": "This event counts instructions that have been architecturally executed." + }, + { + "ArchStdEvent": "CID_WRITE_RETIRED", + "PublicDescription": "This event counts architecturally executed writes to the CONTEXTIDR_EL1 register, which usually contains the kernel PID and can be output with hardware trace." + }, + { + "ArchStdEvent": "BR_IMMED_RETIRED", + "PublicDescription": "This event counts architecturally executed direct branches." + }, + { + "ArchStdEvent": "BR_RETURN_RETIRED", + "PublicDescription": "This event counts architecturally executed procedure returns." + }, + { + "ArchStdEvent": "TTBR_WRITE_RETIRED", + "PublicDescription": "This event counts architectural writes to TTBR0/1_EL1. If virtualization host extensions are enabled (by setting the HCR_EL2.E2H bit to 1), then accesses to TTBR0/1_EL1 that are redirected to TTBR0/1_EL2, or accesses to TTBR0/1_EL12, are counted. TTBRn registers are typically updated when the kernel is swapping user-space threads or applications." + }, + { + "ArchStdEvent": "BR_RETIRED", + "PublicDescription": "This event counts architecturally executed branches, whether the branch is taken or not. Instructions that explicitly write to the PC are also counted. Note that exception generating instructions, exception return instructions, and context synchronization instructions are not counted." + }, + { + "ArchStdEvent": "BR_MIS_PRED_RETIRED", + "PublicDescription": "This event counts branches counted by BR_RETIRED which were mispredicted and caused a pipeline flush." 
+ }, + { + "ArchStdEvent": "OP_RETIRED", + "PublicDescription": "This event counts micro-operations that are architecturally executed. This is a count of number of micro-operations retired from the commit queue in a single cycle." + }, + { + "ArchStdEvent": "BR_INDNR_TAKEN_RETIRED", + "PublicDescription": "This event counts architecturally executed indirect branches excluding procedure returns that were taken." + }, + { + "ArchStdEvent": "BR_IMMED_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed direct branches that were correctly predicted." + }, + { + "ArchStdEvent": "BR_IMMED_MIS_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed direct branches that were mispredicted and caused a pipeline flush." + }, + { + "ArchStdEvent": "BR_IND_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed indirect branches including procedure returns that were correctly predicted." + }, + { + "ArchStdEvent": "BR_IND_MIS_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed indirect branches including procedure returns that were mispredicted and caused a pipeline flush." + }, + { + "ArchStdEvent": "BR_RETURN_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed procedure returns that were correctly predicted." + }, + { + "ArchStdEvent": "BR_RETURN_MIS_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed procedure returns that were mispredicted and caused a pipeline flush." + }, + { + "ArchStdEvent": "BR_INDNR_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed indirect branches excluding procedure returns that were correctly predicted." + }, + { + "ArchStdEvent": "BR_INDNR_MIS_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed indirect branches excluding procedure returns that were mispredicted and caused a pipeline flush." 
+ }, + { + "ArchStdEvent": "BR_TAKEN_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed branches that were taken and were correctly predicted." + }, + { + "ArchStdEvent": "BR_TAKEN_MIS_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed branches that were taken and were mispredicted causing a pipeline flush." + }, + { + "ArchStdEvent": "BR_SKIP_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed branches that were not taken and were correctly predicted." + }, + { + "ArchStdEvent": "BR_SKIP_MIS_PRED_RETIRED", + "PublicDescription": "This event counts architecturally executed branches that were not taken and were mispredicted causing a pipeline flush." + }, + { + "ArchStdEvent": "BR_PRED_RETIRED", + "PublicDescription": "This event counts branch instructions counted by BR_RETIRED which were correctly predicted." + }, + { + "ArchStdEvent": "BR_IND_RETIRED", + "PublicDescription": "This event counts architecturally executed indirect branches including procedure returns." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json new file mode 100644 index 0000000000000..00d0c5051a482 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json @@ -0,0 +1,42 @@ +[ + { + "ArchStdEvent": "SAMPLE_POP", + "PublicDescription": "This event counts statistical profiling sample population, the count of all operations that could be sampled but may or may not be chosen for sampling." + }, + { + "ArchStdEvent": "SAMPLE_FEED", + "PublicDescription": "This event counts statistical profiling samples taken for sampling." + }, + { + "ArchStdEvent": "SAMPLE_FILTRATE", + "PublicDescription": "This event counts statistical profiling samples taken which are not removed by filtering." 
+ }, + { + "ArchStdEvent": "SAMPLE_COLLISION", + "PublicDescription": "This event counts statistical profiling samples that have collided with a previous sample and so therefore not taken." + }, + { + "ArchStdEvent": "SAMPLE_FEED_BR", + "PublicDescription": "This event counts statistical profiling samples taken which are branches." + }, + { + "ArchStdEvent": "SAMPLE_FEED_LD", + "PublicDescription": "This event counts statistical profiling samples taken which are Loads or Load atomic operations." + }, + { + "ArchStdEvent": "SAMPLE_FEED_ST", + "PublicDescription": "This event counts statistical profiling samples taken which are Stores or Store atomic operations." + }, + { + "ArchStdEvent": "SAMPLE_FEED_OP", + "PublicDescription": "This event counts statistical profiling samples taken which are matching any operation type filters supported." + }, + { + "ArchStdEvent": "SAMPLE_FEED_EVENT", + "PublicDescription": "This event counts statistical profiling samples taken which are matching event packet filter constraints." + }, + { + "ArchStdEvent": "SAMPLE_FEED_LAT", + "PublicDescription": "This event counts statistical profiling samples taken which are exceeding minimum latency set by operation latency filter constraints." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json new file mode 100644 index 0000000000000..8bc802f5f3500 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json @@ -0,0 +1,230 @@ +[ + { + "ArchStdEvent": "INST_SPEC", + "PublicDescription": "This event counts operations that have been speculatively executed." + }, + { + "ArchStdEvent": "OP_SPEC", + "PublicDescription": "This event counts micro-operations speculatively executed. This is the count of the number of micro-operations dispatched in a cycle." 
+ }, + { + "ArchStdEvent": "UNALIGNED_LD_SPEC", + "PublicDescription": "This event counts unaligned memory Read operations issued by the CPU. This event counts unaligned accesses (as defined by the actual instruction), even if they are subsequently issued as multiple aligned accesses.\nThis event does not count preload operations (PLD, PLI).\nThis event is a subset of the UNALIGNED_LDST_SPEC event." + }, + { + "ArchStdEvent": "UNALIGNED_ST_SPEC", + "PublicDescription": "This event counts unaligned memory Write operations issued by the CPU. This event counts unaligned accesses (as defined by the actual instruction), even if they are subsequently issued as multiple aligned accesses.\nThis event is a subset of the UNALIGNED_LDST_SPEC event." + }, + { + "ArchStdEvent": "UNALIGNED_LDST_SPEC", + "PublicDescription": "This event counts unaligned memory operations issued by the CPU. This event counts unaligned accesses (as defined by the actual instruction), even if they are subsequently issued as multiple aligned accesses.\nThis event is the sum of the following events:\nUNALIGNED_ST_SPEC and\nUNALIGNED_LD_SPEC." + }, + { + "ArchStdEvent": "LDREX_SPEC", + "PublicDescription": "This event counts Load-Exclusive operations that have been speculatively executed. For example: LDREX, LDX" + }, + { + "ArchStdEvent": "STREX_PASS_SPEC", + "PublicDescription": "This event counts Store-exclusive operations that have been speculatively executed and have successfully completed the Store operation." + }, + { + "ArchStdEvent": "STREX_FAIL_SPEC", + "PublicDescription": "This event counts Store-exclusive operations that have been speculatively executed and have not successfully completed the Store operation." + }, + { + "ArchStdEvent": "STREX_SPEC", + "PublicDescription": "This event counts Store-exclusive operations that have been speculatively executed.\nThis event is the sum of the following events:\nSTREX_PASS_SPEC and\nSTREX_FAIL_SPEC." 
+ }, + { + "ArchStdEvent": "LD_SPEC", + "PublicDescription": "This event counts speculatively executed Load operations including Single Instruction Multiple Data (SIMD) Load operations." + }, + { + "ArchStdEvent": "ST_SPEC", + "PublicDescription": "This event counts speculatively executed Store operations including Single Instruction Multiple Data (SIMD) Store operations." + }, + { + "ArchStdEvent": "LDST_SPEC", + "PublicDescription": "This event counts Load and Store operations that have been speculatively executed." + }, + { + "ArchStdEvent": "DP_SPEC", + "PublicDescription": "This event counts speculatively executed logical or arithmetic instructions such as MOV/MVN operations." + }, + { + "ArchStdEvent": "ASE_SPEC", + "PublicDescription": "This event counts speculatively executed Advanced SIMD operations excluding Load, Store, and Move micro-operations that move data to or from SIMD (vector) registers." + }, + { + "ArchStdEvent": "VFP_SPEC", + "PublicDescription": "This event counts speculatively executed floating point operations. This event does not count operations that move data to or from floating point (vector) registers." + }, + { + "ArchStdEvent": "PC_WRITE_SPEC", + "PublicDescription": "This event counts speculatively executed operations which cause software changes of the PC. Those operations include all taken branch operations." + }, + { + "ArchStdEvent": "CRYPTO_SPEC", + "PublicDescription": "This event counts speculatively executed cryptographic operations except for PMULL and VMULL operations." + }, + { + "ArchStdEvent": "BR_IMMED_SPEC", + "PublicDescription": "This event counts direct branch operations which are speculatively executed." + }, + { + "ArchStdEvent": "BR_RETURN_SPEC", + "PublicDescription": "This event counts procedure return operations (RET, RETAA and RETAB) which are speculatively executed." 
+ }, + { + "ArchStdEvent": "BR_INDIRECT_SPEC", + "PublicDescription": "This event counts indirect branch operations including procedure returns, which are speculatively executed. This includes operations that force a software change of the PC, other than exception-generating operations and direct branch instructions. Some examples of the instructions counted by this event include BR Xn, RET, etc." + }, + { + "ArchStdEvent": "ISB_SPEC", + "PublicDescription": "This event counts ISB operations that are executed." + }, + { + "ArchStdEvent": "DSB_SPEC", + "PublicDescription": "This event counts DSB operations that are speculatively issued to Load/Store unit in the CPU." + }, + { + "ArchStdEvent": "DMB_SPEC", + "PublicDescription": "This event counts DMB operations that are speculatively issued to the Load/Store unit in the CPU. This event does not count implied barriers from Load-acquire/Store-release operations." + }, + { + "ArchStdEvent": "CSDB_SPEC", + "PublicDescription": "This event counts CSDB operations that are speculatively issued to the Load/Store unit in the CPU. This event does not count implied barriers from Load-acquire/Store-release operations." + }, + { + "ArchStdEvent": "RC_LD_SPEC", + "PublicDescription": "This event counts any Load acquire operations that are speculatively executed. For example: LDAR, LDARH, LDARB" + }, + { + "ArchStdEvent": "RC_ST_SPEC", + "PublicDescription": "This event counts any Store release operations that are speculatively executed. For example: STLR, STLRH, STLRB" + }, + { + "ArchStdEvent": "SIMD_INST_SPEC", + "PublicDescription": "This event counts speculatively executed operations that are SIMD or SVE vector operations or Advanced SIMD non-scalar operations." + }, + { + "ArchStdEvent": "ASE_INST_SPEC", + "PublicDescription": "This event counts speculatively executed Advanced SIMD operations." 
+ }, + { + "ArchStdEvent": "SVE_INST_SPEC", + "PublicDescription": "This event counts speculatively executed operations that are SVE operations." + }, + { + "ArchStdEvent": "INT_SPEC", + "PublicDescription": "This event counts speculatively executed integer arithmetic operations." + }, + { + "ArchStdEvent": "SVE_PRED_SPEC", + "PublicDescription": "This event counts speculatively executed predicated SVE operations.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events." + }, + { + "ArchStdEvent": "SVE_PRED_EMPTY_SPEC", + "PublicDescription": "This event counts speculatively executed predicated SVE operations with no active predicate elements.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events." + }, + { + "ArchStdEvent": "SVE_PRED_FULL_SPEC", + "PublicDescription": "This event counts speculatively executed predicated SVE operations with all predicate elements active.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events." 
+ }, + { + "ArchStdEvent": "SVE_PRED_PARTIAL_SPEC", + "PublicDescription": "This event counts speculatively executed predicated SVE operations with at least one but not all active predicate elements.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events." + }, + { + "ArchStdEvent": "SVE_PRED_NOT_FULL_SPEC", + "PublicDescription": "This event counts speculatively executed predicated SVE operations with at least one non active predicate elements.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events." + }, + { + "ArchStdEvent": "PRF_SPEC", + "PublicDescription": "This event counts speculatively executed operations that prefetch memory. For example, Scalar: PRFM, SVE: PRFB, PRFD, PRFH, or PRFW." + }, + { + "ArchStdEvent": "SVE_LDFF_SPEC", + "PublicDescription": "This event counts speculatively executed SVE first fault or non-fault Load operations." + }, + { + "ArchStdEvent": "SVE_LDFF_FAULT_SPEC", + "PublicDescription": "This event counts speculatively executed SVE first fault or non-fault Load operations that clear at least one bit in the FFR." + }, + { + "ArchStdEvent": "ASE_SVE_INT8_SPEC", + "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type being an 8-bit integer." 
+ }, + { + "ArchStdEvent": "ASE_SVE_INT16_SPEC", + "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type a 16-bit integer." + }, + { + "ArchStdEvent": "ASE_SVE_INT32_SPEC", + "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type a 32-bit integer." + }, + { + "ArchStdEvent": "ASE_SVE_INT64_SPEC", + "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type a 64-bit integer." + }, + { + "EventCode": "0x011d", + "EventName": "SPEC_RET_STACK_FULL", + "PublicDescription": "This event counts predict pipe stalls due to speculative return address predictor full." + }, + { + "EventCode": "0x011f", + "EventName": "MOPS_SPEC", + "PublicDescription": "Macro-ops speculatively decoded." + }, + { + "EventCode": "0x0180", + "EventName": "BR_SPEC_PRED_TAKEN", + "PublicDescription": "Number of predicted taken from branch predictor." + }, + { + "EventCode": "0x0181", + "EventName": "BR_SPEC_PRED_TAKEN_FROM_L2BTB", + "PublicDescription": "Number of predicted taken branch from L2 BTB." + }, + { + "EventCode": "0x0182", + "EventName": "BR_SPEC_PRED_TAKEN_MULTI", + "PublicDescription": "Number of predicted taken for polymorphic branch." + }, + { + "EventCode": "0x0185", + "EventName": "BR_SPEC_PRED_STATIC", + "PublicDescription": "Number of post fetch prediction." + }, + { + "EventCode": "0x01d0", + "EventName": "TLBI_LOCAL_SPEC", + "PublicDescription": "A non-broadcast TLBI instruction executed (Speculatively or otherwise) on *this* PE." + }, + { + "EventCode": "0x01d1", + "EventName": "TLBI_BROADCAST_SPEC", + "PublicDescription": "A broadcast TLBI instruction executed (Speculatively or otherwise) on *this* PE." + }, + { + "EventCode": "0x01e7", + "EventName": "BR_SPEC_PRED_ALN_REDIR", + "PublicDescription": "BPU predict pipe align redirect (either AL-APQ hit/miss)." 
+ }, + { + "EventCode": "0x0200", + "EventName": "SIMD_CRYPTO_INST_SPEC", + "PublicDescription": "SIMD, SVE, and CRYPTO instructions speculatively decoded." + }, + { + "EventCode": "0x022e", + "EventName": "VPRED_LD_SPEC", + "PublicDescription": "This event counts the number of Speculatively-executed-Load operations with addresses produced by the value-prediction mechanism. The loaded data might be discarded if the predicted address differs from the actual address." + }, + { + "EventCode": "0x022f", + "EventName": "VPRED_LD_SPEC_MISMATCH", + "PublicDescription": "This event counts a subset of VPRED_LD_SPEC where the predicted Load address and the actual address mismatched." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json new file mode 100644 index 0000000000000..92d9e0866c247 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json @@ -0,0 +1,145 @@ +[ + { + "ArchStdEvent": "STALL_FRONTEND", + "PublicDescription": "This event counts cycles when frontend could not send any micro-operations to the rename stage because of frontend resource stalls caused by fetch memory latency or branch prediction flow stalls. STALL_SLOT_FRONTEND counts SLOTS during the cycle when this event counts. STALL_SLOT_FRONTEND will count SLOTS when this event is counted on this CPU." + }, + { + "ArchStdEvent": "STALL_BACKEND", + "PublicDescription": "This event counts cycles whenever the rename unit is unable to send any micro-operations to the backend of the pipeline because of backend resource constraints. Backend resource constraints can include issue stage fullness, execution stage fullness, or other internal pipeline resource fullness. All the backend slots were empty during the cycle when this event counts."
+ }, + { + "ArchStdEvent": "STALL", + "PublicDescription": "This event counts cycles when no operations are sent to the rename unit from the frontend or from the rename unit to the backend for any reason (either frontend or backend stall). This event is the sum of the following events:\nSTALL_FRONTEND and\nSTALL_BACKEND." + }, + { + "ArchStdEvent": "STALL_SLOT_BACKEND", + "PublicDescription": "This event counts slots per cycle in which no operations are sent from the rename unit to the backend due to backend resource constraints. STALL_BACKEND counts during the cycle when STALL_SLOT_BACKEND counts at least 1. STALL_BACKEND counts during the cycle when STALL_SLOT_BACKEND is SLOTS." + }, + { + "ArchStdEvent": "STALL_SLOT_FRONTEND", + "PublicDescription": "This event counts slots per cycle in which no operations are sent to the rename unit from the frontend due to frontend resource constraints. STALL_FRONTEND counts during the cycle when STALL_SLOT_FRONTEND is SLOTS." + }, + { + "ArchStdEvent": "STALL_SLOT", + "PublicDescription": "This event counts slots per cycle in which no operations are sent to the rename unit from the frontend or from the rename unit to the backend for any reason (either frontend or backend stall).\nSTALL_SLOT is the sum of the following events:\nSTALL_SLOT_FRONTEND and\nSTALL_SLOT_BACKEND." + }, + { + "ArchStdEvent": "STALL_BACKEND_MEM", + "PublicDescription": "This event counts cycles when the backend is stalled because there is a pending demand Load request in progress in the last level Core cache.\nLast level cache in this CPU is Level 2, hence this event counts same as STALL_BACKEND_L2D." + }, + { + "ArchStdEvent": "STALL_FRONTEND_MEMBOUND", + "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the memory resources." 
+ }, + { + "ArchStdEvent": "STALL_FRONTEND_L1I", + "PublicDescription": "This event counts cycles when the frontend is stalled because there is an instruction fetch request pending in the L1 I-cache." + }, + { + "ArchStdEvent": "STALL_FRONTEND_MEM", + "PublicDescription": "This event counts cycles when the frontend is stalled because there is an instruction fetch request pending in the last level Core cache.\nLast level cache in this CPU is Level 2, hence this event counts rather than STALL_FRONTEND_L2I." + }, + { + "ArchStdEvent": "STALL_FRONTEND_TLB", + "PublicDescription": "This event counts when the frontend is stalled on any TLB misses being handled. This event also counts the TLB accesses made by hardware prefetches." + }, + { + "ArchStdEvent": "STALL_FRONTEND_CPUBOUND", + "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the CPU resources excluding memory resources." + }, + { + "ArchStdEvent": "STALL_FRONTEND_FLOW", + "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the branch prediction unit." + }, + { + "ArchStdEvent": "STALL_FRONTEND_FLUSH", + "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage as the frontend is recovering from a machine flush or resteer. Example scenarios that cause a flush include branch mispredictions, taken exceptions, microarchitectural flush etc." + }, + { + "ArchStdEvent": "STALL_BACKEND_MEMBOUND", + "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations due to resource constraints in the memory resources." + }, + { + "ArchStdEvent": "STALL_BACKEND_L1D", + "PublicDescription": "This event counts cycles when the backend is stalled because there is a pending demand Load request in progress in the L1 D-cache." 
+ }, + { + "ArchStdEvent": "STALL_BACKEND_TLB", + "PublicDescription": "This event counts cycles when the backend is stalled on any demand TLB misses being handled." + }, + { + "ArchStdEvent": "STALL_BACKEND_ST", + "PublicDescription": "This event counts cycles when the backend is stalled and there is a Store that has not reached the pre-commit stage." + }, + { + "ArchStdEvent": "STALL_BACKEND_CPUBOUND", + "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations due to any resource constraints in the CPU excluding memory resources." + }, + { + "ArchStdEvent": "STALL_BACKEND_BUSY", + "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations because the issue queues are full to take any operations for execution." + }, + { + "ArchStdEvent": "STALL_BACKEND_ILOCK", + "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations due to resource constraints imposed by input dependency." + }, + { + "ArchStdEvent": "STALL_BACKEND_RENAME", + "PublicDescription": "This event counts cycles when backend is stalled even when operations are available from the frontend but at least one is not ready to be sent to the backend because no rename register is available." + }, + { + "EventCode": "0x0158", + "EventName": "FLAG_DISP_STALL", + "PublicDescription": "Rename stalled due to FRF(Flag register file) full." + }, + { + "EventCode": "0x0159", + "EventName": "GEN_DISP_STALL", + "PublicDescription": "Rename stalled due to GRF (General-purpose register file) full." + }, + { + "EventCode": "0x015a", + "EventName": "VEC_DISP_STALL", + "PublicDescription": "Rename stalled due to VRF (Vector register file) full." + }, + { + "EventCode": "0x015c", + "EventName": "SX_IQ_STALL", + "PublicDescription": "Dispatch stalled due to IQ full, SX." + }, + { + "EventCode": "0x015d", + "EventName": "MX_IQ_STALL", + "PublicDescription": "Dispatch stalled due to IQ full, MX." 
+ }, + { + "EventCode": "0x015e", + "EventName": "LS_IQ_STALL", + "PublicDescription": "Dispatch stalled due to IQ full, LS." + }, + { + "EventCode": "0x015f", + "EventName": "VX_IQ_STALL", + "PublicDescription": "Dispatch stalled due to IQ full, VX." + }, + { + "EventCode": "0x0160", + "EventName": "MCQ_FULL_STALL", + "PublicDescription": "Dispatch stalled due to MCQ full." + }, + { + "EventCode": "0x01cf", + "EventName": "PRD_DISP_STALL", + "PublicDescription": "Rename stalled due to predicate registers (physical) being full." + }, + { + "EventCode": "0x01e0", + "EventName": "CSDB_STALL", + "PublicDescription": "Rename stalled due to CSDB." + }, + { + "EventCode": "0x01e2", + "EventName": "STALL_SLOT_FRONTEND_WITHOUT_MISPRED", + "PublicDescription": "Stall slot frontend during non-mispredicted branch.\nThis event counts the STALL_SLOT_FRONTEND Events, except for the 4 cycles following a mispredicted branch Event or 4 cycles following a commit flush&restart Event." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json new file mode 100644 index 0000000000000..18ec5c348c873 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json @@ -0,0 +1,158 @@ +[ + { + "ArchStdEvent": "L1I_TLB_REFILL", + "PublicDescription": "This event counts L1 Instruction TLB refills from any instruction fetch (demand, hardware prefetch, and software preload accesses). If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once. This event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB." + }, + { + "ArchStdEvent": "L1D_TLB_REFILL", + "PublicDescription": "This event counts L1 Data TLB accesses that resulted in TLB refills. If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once.
This event counts for refills caused by preload instructions or hardware prefetch accesses. This event counts regardless of whether the miss hits in L2 or results in a translation table walk. This event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This event will not count on an access from an AT (Address Translation) instruction.\nThis event counts the sum of the following events:\nL1D_TLB_REFILL_RD and\nL1D_TLB_REFILL_WR." + }, + { + "ArchStdEvent": "L1D_TLB", + "PublicDescription": "This event counts L1 Data TLB accesses caused by any memory Load or Store operation.\nNote that Load or Store instructions can be broken up into multiple memory operations.\nThis event does not count TLB maintenance operations." + }, + { + "ArchStdEvent": "L1I_TLB", + "PublicDescription": "This event counts L1 instruction TLB accesses (caused by demand or hardware prefetch or software preload accesses), whether the access hits or misses in the TLB. This event counts both demand accesses and prefetch or preload generated accesses.\nThis event is a superset of the L1I_TLB_REFILL event." + }, + { + "ArchStdEvent": "L2D_TLB_REFILL", + "PublicDescription": "This event counts L2 TLB refills caused by memory operations from both data and instruction fetch, except for those caused by TLB maintenance operations and hardware prefetches.\nThis event is the sum of the following events:\nL2D_TLB_REFILL_RD and\nL2D_TLB_REFILL_WR." + }, + { + "ArchStdEvent": "L2D_TLB", + "PublicDescription": "This event counts L2 TLB accesses except those caused by TLB maintenance operations.\nThis event is the sum of the following events:\nL2D_TLB_RD and\nL2D_TLB_WR." + }, + { + "ArchStdEvent": "DTLB_WALK", + "PublicDescription": "This event counts number of demand data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. 
Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this event does not count walks triggered by TLB maintenance operations.\nThis event does not include prefetches." + }, + { + "ArchStdEvent": "ITLB_WALK", + "PublicDescription": "This event counts number of instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this event does not count walks triggered by TLB maintenance operations.\nThis event does not include prefetches." + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_RD", + "PublicDescription": "This event counts L1 Data TLB refills caused by memory Read operations. If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once. This event counts for refills caused by preload instructions or hardware prefetch accesses. This event counts regardless of whether the miss hits in L2 or results in a translation table walk. This event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This event will not count on an access from an Address Translation (AT) instruction.\nThis event is a subset of the L1D_TLB_REFILL event." + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_WR", + "PublicDescription": "This event counts L1 Data TLB refills caused by data side memory Write operations. 
If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once. This event counts for refills caused by preload instructions or hardware prefetch accesses. This event counts regardless of whether the miss hits in L2 or results in a translation table walk. This event will not count if the table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This event will not count with an access from an Address Translation (AT) instruction.\nThis event is a subset of the L1D_TLB_REFILL event." + }, + { + "ArchStdEvent": "L1D_TLB_RD", + "PublicDescription": "This event counts L1 Data TLB accesses caused by memory Read operations. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations." + }, + { + "ArchStdEvent": "L1D_TLB_WR", + "PublicDescription": "This event counts any L1 Data side TLB accesses caused by memory Write operations. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations." + }, + { + "ArchStdEvent": "L2D_TLB_REFILL_RD", + "PublicDescription": "This event counts L2 TLB refills caused by memory Read operations from both data and instruction fetch except for those caused by TLB maintenance operations or hardware prefetches.\nThis event is a subset of the L2D_TLB_REFILL event." + }, + { + "ArchStdEvent": "L2D_TLB_REFILL_WR", + "PublicDescription": "This event counts L2 TLB refills caused by memory Write operations from both data and instruction fetch except for those caused by TLB maintenance operations.\nThis event is a subset of the L2D_TLB_REFILL event." + }, + { + "ArchStdEvent": "L2D_TLB_RD", + "PublicDescription": "This event counts L2 TLB accesses caused by memory Read operations from both data and instruction fetch except for those caused by TLB maintenance operations.\nThis event is a subset of the L2D_TLB event." 
+ }, + { + "ArchStdEvent": "L2D_TLB_WR", + "PublicDescription": "This event counts L2 TLB accesses caused by memory Write operations from both data and instruction fetch except for those caused by TLB maintenance operations.\nThis event is a subset of the L2D_TLB event." + }, + { + "ArchStdEvent": "DTLB_WALK_PERCYC", + "PublicDescription": "This event counts the number of data translation table walks in progress per cycle." + }, + { + "ArchStdEvent": "ITLB_WALK_PERCYC", + "PublicDescription": "This event counts the number of instruction translation table walks in progress per cycle." + }, + { + "ArchStdEvent": "L1D_TLB_RW", + "PublicDescription": "This event counts L1 Data TLB demand accesses caused by memory Read or Write operations. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations." + }, + { + "ArchStdEvent": "L1I_TLB_RD", + "PublicDescription": "This event counts L1 Instruction TLB demand accesses whether the access hits or misses in the TLB." + }, + { + "ArchStdEvent": "L1D_TLB_PRFM", + "PublicDescription": "This event counts L1 Data TLB accesses generated by software prefetch or preload memory accesses. Load or Store instructions can be broken into multiple memory operations. This event does not count TLB maintenance operations." + }, + { + "ArchStdEvent": "L1I_TLB_PRFM", + "PublicDescription": "This event counts L1 Instruction TLB accesses generated by software preload or prefetch instructions. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations." + }, + { + "ArchStdEvent": "DTLB_HWUPD", + "PublicDescription": "This event counts number of memory accesses triggered by a data translation table walk and performing an update of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. 
Note that this event counts accesses triggered by software preloads, but not accesses triggered by hardware prefetchers." + }, + { + "ArchStdEvent": "ITLB_HWUPD", + "PublicDescription": "This event counts number of memory accesses triggered by an instruction translation table walk and performing an update of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD." + }, + { + "ArchStdEvent": "DTLB_STEP", + "PublicDescription": "This event counts number of memory accesses triggered by a demand data translation table walk and performing a Read of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that this event counts accesses triggered by software preloads, but not accesses triggered by hardware prefetchers." + }, + { + "ArchStdEvent": "ITLB_STEP", + "PublicDescription": "This event counts number of memory accesses triggered by an instruction translation table walk and performing a Read of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD." + }, + { + "ArchStdEvent": "DTLB_WALK_LARGE", + "PublicDescription": "This event counts number of demand data translation table walks caused by a miss in the L2 TLB and yielding a large page. The set of large pages is defined as all pages with a final size higher than or equal to 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. 
If DTLB_WALK_BLOCK is implemented, then it is an alias for this event in this family.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this event does not count walks triggered by TLB maintenance operations." + }, + { + "ArchStdEvent": "ITLB_WALK_LARGE", + "PublicDescription": "This event counts number of instruction translation table walks caused by a miss in the L2 TLB and yielding a large page. The set of large pages is defined as all pages with a final size higher than or equal to 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. In this family, this is equal to ITLB_WALK_BLOCK event.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations." + }, + { + "ArchStdEvent": "DTLB_WALK_SMALL", + "PublicDescription": "This event counts number of data translation table walks caused by a miss in the L2 TLB and yielding a small page. The set of small pages is defined as all pages with a final size lower than 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. If DTLB_WALK_PAGE event is implemented, then it is an alias for this event in this family. Note that partial translations that cause a translation table walk are also counted.\nAlso note that this event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this event does not count walks triggered by TLB maintenance operations." + }, + { + "ArchStdEvent": "ITLB_WALK_SMALL", + "PublicDescription": "This event counts number of instruction translation table walks caused by a miss in the L2 TLB and yielding a small page. 
The set of small pages is defined as all pages with a final size lower than 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. In this family, this is equal to ITLB_WALK_PAGE event.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations." + }, + { + "ArchStdEvent": "DTLB_WALK_RW", + "PublicDescription": "This event counts number of demand data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations." + }, + { + "ArchStdEvent": "ITLB_WALK_RD", + "PublicDescription": "This event counts number of demand instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations." + }, + { + "ArchStdEvent": "DTLB_WALK_PRFM", + "PublicDescription": "This event counts number of software prefetches or preloads generated data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. 
Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations." + }, + { + "ArchStdEvent": "ITLB_WALK_PRFM", + "PublicDescription": "This event counts number of software prefetches or preloads generated instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations." + }, + { + "EventCode": "0x010e", + "EventName": "L1D_TLB_REFILL_RD_PF", + "PublicDescription": "L1 Data TLB refill, Read, prefetch." + }, + { + "EventCode": "0x010f", + "EventName": "L2TLB_PF_REFILL", + "PublicDescription": "L2 Data TLB refill, Read, prefetch.\nThis event counts MMU refills due to internal PFStream requests." + }, + { + "EventCode": "0x0223", + "EventName": "L1I_TLB_REFILL_RD", + "PublicDescription": "L1 Instruction TLB refills due to Demand miss." + }, + { + "EventCode": "0x0224", + "EventName": "L1I_TLB_REFILL_PRFM", + "PublicDescription": "L1 Instruction TLB refills due to Software prefetch miss." + } +] From cd5faf674b6b9c189dcb8e2a8cf7d666a7a683a2 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Mon, 4 May 2026 17:52:04 +0000 Subject: [PATCH 17/17] NVIDIA: VR: SAUCE: perf/arm_pmu: Skip PMCCNTR_EL0 on NVIDIA Olympus PMCCNTR_EL0 may continue to increment on NVIDIA Olympus CPUs while the PE is in WFI/WFE. 
That does not necessarily match the CPU_CYCLES event counted by a programmable counter, so using PMCCNTR_EL0 for cycles can give results that differ from the programmable counter path. Extend the existing PMCCNTR avoidance decision from the SMT case to also cover Olympus. Store the result in the common arm_pmu state at registration time, so arm_pmuv3 can keep using a single flag when deciding whether CPU_CYCLES may use PMCCNTR_EL0. Signed-off-by: Besar Wicaksono (backported from https://lore.kernel.org/all/20260504175204.3122979-1-bwicaksono@nvidia.com/) Signed-off-by: Matthew R. Ochs --- drivers/perf/arm_pmu.c | 7 ++++- drivers/perf/arm_pmuv3.c | 51 +++++++++++++++++++++++++++++++----- include/linux/perf/arm_pmu.h | 2 +- 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 939bcbd433aab..aa1dac0b440fa 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -931,8 +931,13 @@ int armpmu_register(struct arm_pmu *pmu) /* * By this stage we know our supported CPUs on either DT/ACPI platforms, * detect the SMT implementation. + * On SMT CPUs, the PMCCNTR_EL0 increments from the processor clock rather + * than the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue + * counting on a WFI PE if one of its SMT siblings is not idle on a + * multi-threaded implementation. So don't use it on SMT cores. */ - pmu->has_smt = topology_core_has_smt(cpumask_first(&pmu->supported_cpus)); + pmu->avoid_pmccntr |= + topology_core_has_smt(cpumask_first(&pmu->supported_cpus)); if (!pmu->set_event_filter) pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 8014ff766cff5..1ee4a09d0dccb 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -8,6 +8,7 @@ * This code is based heavily on the ARMv7 perf event code.
*/ +#include #include #include #include @@ -1002,13 +1003,7 @@ static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, if (has_branch_stack(event)) return false; - /* - * The PMCCNTR_EL0 increments from the processor clock rather than - * the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue - * counting on a WFI PE if one of its SMT sibling is not idle on a - * multi-threaded implementation. So don't use it on SMT cores. - */ - if (cpu_pmu->has_smt) + if (cpu_pmu->avoid_pmccntr) return false; return true; @@ -1299,6 +1294,41 @@ static int armv8_vulcan_map_event(struct perf_event *event) &armv8_vulcan_perf_cache_map); } +#ifdef CONFIG_ARM64 +/* + * List of CPUs that should avoid using PMCCNTR_EL0. + */ +static struct midr_range armv8pmu_avoid_pmccntr_cpus[] = { + /* + * The PMCCNTR_EL0 in Olympus CPU may still increment while in WFI/WFE state. + * This is an implementation specific behavior and not an erratum. + * + * From ARM DDI0487 D14.4: + * It is IMPLEMENTATION SPECIFIC whether CPU_CYCLES and PMCCNTR count + * when the PE is in WFI or WFE state, even if the clocks are not stopped. + * + * From ARM DDI0487 D24.5.2: + * All counters are subject to any changes in clock frequency, including + * clock stopping caused by the WFI and WFE instructions. + * This means that it is CONSTRAINED UNPREDICTABLE whether or not + * PMCCNTR_EL0 continues to increment when clocks are stopped by WFI and + * WFE instructions. 
+ */ + MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), + {} +}; + +static bool armv8pmu_is_in_avoid_pmccntr_cpus(void) +{ + return is_midr_in_range_list(armv8pmu_avoid_pmccntr_cpus); +} +#else +static bool armv8pmu_is_in_avoid_pmccntr_cpus(void) +{ + return false; +} +#endif + struct armv8pmu_probe_info { struct arm_pmu *pmu; bool present; @@ -1348,6 +1378,13 @@ static void __armv8pmu_probe_pmu(void *info) else cpu_pmu->reg_pmmir = 0; + /* + * On some CPUs, PMCCNTR_EL0 does not match the behavior of CPU_CYCLES + * programmable counter, so avoid routing cycles through PMCCNTR_EL0 to + * prevent inconsistency in the results. + */ + cpu_pmu->avoid_pmccntr |= armv8pmu_is_in_avoid_pmccntr_cpus(); + brbe_probe(cpu_pmu); } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 52b37f7bdbf9e..02d2c7f45b527 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -119,7 +119,7 @@ struct arm_pmu { /* PMUv3 only */ int pmuver; - bool has_smt; + bool avoid_pmccntr; u64 reg_pmmir; u64 reg_brbidr; #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40