Skip to content

Commit c256c20

Browse files
agrabezh authored and igcbot committed
Drop SIMD32 to SIMD16 on non-VRT XE2 over GRF budget
Extend the early SIMD16-drop heuristic to non-VRT XE2 platforms: drop SIMD32 to SIMD16 when its register-pressure estimate exceeds the GRF budget. The drop is controlled by the existing AllowSIMD16DropForXE2Plus switch; the redundant AllowEarlySIMD16DropForXE3 flag is removed.
1 parent e0fae9f commit c256c20

3 files changed

Lines changed: 20 additions & 6 deletions

File tree

IGC/Compiler/CISACodeGen/OpenCLKernelCodeGen.cpp

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2601,7 +2601,21 @@ static bool shouldDropToSIMD16(uint32_t maxPressure, uint32_t simd16Pressure, ui
26012601
if (simdMode != SIMDMode::SIMD32 || !isEntryFunc(pMdUtils, F)) {
26022602
return false;
26032603
}
2604-
if (!pCtx->isAutoGRFSelectionEnabled() || pCtx->getNumGRFPerThread(false) != 0) {
2604+
2605+
bool autoGRF = pCtx->isAutoGRFSelectionEnabled();
2606+
2607+
// Non-VRT platforms have no VRT GRF step-up: SIMD32 is only profitable when
2608+
// its register pressure fits the GRF budget. Drop to SIMD16 when SIMD32
2609+
// pressure exceeds the budget -- the forced GRF count, otherwise 128 (256 in
2610+
// auto large-GRF mode).
2611+
if (pCtx->platform.isCoreXE2()) {
2612+
uint32_t grfBudget = pCtx->getNumGRFPerThread(false);
2613+
if (grfBudget == 0)
2614+
grfBudget = autoGRF ? 256 : 128;
2615+
return simd32Pressure > grfBudget;
2616+
}
2617+
2618+
if (!autoGRF || pCtx->getNumGRFPerThread(false) != 0) {
26052619
return false;
26062620
}
26072621

@@ -2729,7 +2743,8 @@ SIMDStatus COpenCLKernel::checkSIMDCompileCondsForMin16(SIMDMode simdMode, EmitP
27292743
}
27302744
}
27312745

2732-
if (EP.m_canAbortOnSpill && pCtx->platform.isCoreXE3() && IGC_IS_FLAG_ENABLED(AllowEarlySIMD16DropForXE3)) {
2746+
bool isSupportedCore = pCtx->platform.isCoreXE2() || pCtx->platform.isCoreXE3();
2747+
if (EP.m_canAbortOnSpill && isSupportedCore && IGC_IS_FLAG_ENABLED(AllowSIMD16DropForXE2Plus)) {
27332748
uint32_t simd16Pressure = getMaxPressureForSIMD(F, numLanes(SIMDMode::SIMD16));
27342749
uint32_t simd32Pressure = getMaxPressureForSIMD(F, numLanes(SIMDMode::SIMD32));
27352750
bool shouldDrop = shouldDropToSIMD16(maxPressure, simd16Pressure, simd32Pressure, simdMode, pCtx, pMdUtils, &F);

IGC/common/igc_flags.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1636,8 +1636,8 @@ DECLARE_IGC_REGKEY(bool, VectorizerLog, false, "Dump Vectorizer Log, usefull for
16361636
DECLARE_IGC_REGKEY(bool, VectorizerLogToErr, false, "Dump Vectorizer Log to stdErr", true)
16371637
DECLARE_IGC_REGKEY(bool, EnableReusingXYZWStoreConstPayload, true, "Enable reusing XYZW stores const payload", false)
16381638
DECLARE_IGC_REGKEY(bool, EnableReusingLSCStoreConstPayload, false, "Enable reusing LSC stores const payload", false)
1639-
DECLARE_IGC_REGKEY(bool, AllowSIMD16DropForXE2Plus, true, "Controls the switch for XE2 and XE3 simd16 drop", false)
1640-
DECLARE_IGC_REGKEY(bool, AllowEarlySIMD16DropForXE3, true, "Controls the early drop to simd16 for XE3", false)
1639+
DECLARE_IGC_REGKEY(bool, AllowSIMD16DropForXE2Plus, true,
1640+
"Controls the switch for XE2 and XE3 simd16 drop, including the early RPE-based drop", false)
16411641
DECLARE_IGC_REGKEY(DWORD, EarlySIMD16DropForXE3Threshold, 256, "Threshold for the early drop to simd16 for XE3", false)
16421642
DECLARE_IGC_REGKEY(DWORD, OCLVRTSimd16DropSimd32High, 160,
16431643
"Drop SIMD32 to SIMD16 on VRT platforms when SIMD32 RPE exceeds this value", false)

documentation/configuration_flags.md

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -683,10 +683,9 @@ $ export IGC_ShaderDumpEnable=1
683683
| `AllocateZeroInitializedVarsInBss` | Allocate zero initialized global variables in .bss section in ZEBinary | Available |
684684
| `AllowConstMadOpMovToReg` | Enable matching of mad instruction if constant greater than 16-bits. This will generate a mov in vISA for the constant operand due to it not fitting as an imm16 operand. At this point, the generated asm likely will fall back onto mul+add for the main case where src1 is the constant | - |
685685
| `AllowCrossBlockMatchMad` | Enable cross basic block matching of mad instructions. This may lead to increased register pressure, but in exchange, may reduce instruction count | - |
686-
| `AllowEarlySIMD16DropForXE3` | Controls the early drop to simd16 for XE3 | - |
687686
| `AllowMultipleMulUsesMatchMad` | Enable a multiply instruction with multiple uses to be matched to a mad instruction. This essentially forces the recalculation of the intermediate multiply result for every potential mad instruction, which will have performance impacts but may reduce instruction count and register pressure in case both mul operands need to be live past the add/sub but the intermediate mul result does not. | - |
688687
| `AllowNonLoopConstantPromotion` | Allows promotion for constants not in loop (e.g. used once) | - |
689-
| `AllowSIMD16DropForXE2Plus` | Controls the switch for XE2 and XE3 simd16 drop | - |
688+
| `AllowSIMD16DropForXE2Plus` | Controls the switch for XE2 and XE3 simd16 drop, including the early RPE-based drop | - |
690689
| `AllowStackCallRetry` | Enable/Disable retry when stack function spill. 0 - Don't allow, 1 - Allow retry on kernel group, 2 - Allow retry per function | - |
691690
| `BlockFrequencySampling` | Use block frequencies to derive a distribution | Available |
692691
| `ByPassAllocaSizeHeuristic` | Force some Alloca to pass the pressure heuristic until the given size | Available |

0 commit comments

Comments
 (0)