Skip to content
This repository was archived by the owner on Jan 7, 2023. It is now read-only.

Commit df40b17

Browse files
Toni Lönnberg authored and strassek committed
FROMLIST: SQUASH: i965: SIMD32 selection heuristics
(cover letter https://patchwork.freedesktop.org/series/51006/) FROMLIST: i965: SIMD32 heuristics debug flag Added a new DEBUG_HEUR32 flag to INTEL_DEBUG flags for enabling SIMD32 selection heuristics. (am from https://patchwork.freedesktop.org/patch/256764/) FROMLIST: i965: SIMD32 heuristics control data Added a new structure for holding SIMD32 heuristics control data. The control data itself will be fetched from drirc. (am from https://patchwork.freedesktop.org/patch/256806/) FROMLIST: i965: SIMD32 heuristics control data from drirc To be able to test the heuristics with different parameters, they can be controlled via environment variables through drirc. (am from https://patchwork.freedesktop.org/patch/256788/) FROMLIST: mesa: Helper functions for counting set bits in a mask (am from https://patchwork.freedesktop.org/patch/256765/) FROMLIST: i965/fs: Save the instruction count of each dispatch width The SIMD32 selection heuristics will use this information for deciding whether SIMD32 shaders should be used. (am from https://patchwork.freedesktop.org/patch/256793/) FROMLIST: i965/fs: SIMD32 selection heuristic based on grouped texture fetches The function goes through the compiled shader and checks how many grouped texture fetches there are. This is a simple heuristic which gets rid of most of the regressions when enabling SIMD32 shaders but still retains some of the benefits. (am from https://patchwork.freedesktop.org/patch/256798/) FROMLIST: i965/fs: Enable all SIMD32 heuristics There are three simple heuristics for SIMD32 shader enabling: - How many MRTs does the shader write into? - How many grouped texture fetches does the shader have? - How many instructions does the SIMD32 shader have compared to the SIMD16 shader? 
For testing purposes, the heuristics can be controlled via these environment variables:
- simd32_heuristic_mrt_check - Enables MRT write check - Default: true
- simd32_heuristic_max_mrts - How many MRT writes the heuristic allows - Default: 1
- simd32_heuristic_grouped_check - Enables grouped texture fetch check - Default: true
- simd32_heuristic_grouped_sends - How many grouped texture fetches the heuristic allows - Default: 6
- simd32_heuristic_inst_check - Enables SIMD32 vs. SIMD16 instruction count check - Default: true
- simd32_heuristic_inst_ratio - SIMD32 vs. SIMD16 instruction count ratio the heuristic allows - Default: 2.3
SIMD32 shaders are also not compiled when SIMD16 compilation fails or spills. (am from https://patchwork.freedesktop.org/patch/256766/)
1 parent be4913a commit df40b17

9 files changed

Lines changed: 152 additions & 7 deletions

File tree

src/intel/compiler/brw_compiler.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ struct ra_regs;
3838
struct nir_shader;
3939
struct brw_program;
4040

41+
/**
 * Tunables for the SIMD32 fragment shader selection heuristics.
 *
 * The i965 driver fills these fields from the simd32_heuristic_* driconf
 * options. Each *_check flag enables one test; the field following it holds
 * the limit that test compares against.
 */
struct brw_simd32_heuristics_control {
42+
/* Enable the grouped texture fetch check. */
bool grouped_sends_check;
43+
/* Largest number of grouped (consecutive) texture fetches still allowed. */
int max_grouped_sends;
44+
/* Enable the SIMD32 vs. SIMD16 instruction count check. */
bool inst_count_check;
45+
/* SIMD32 is accepted only while (SIMD32 count / SIMD16 count) <= this. */
float inst_count_ratio;
46+
/* Enable the MRT write check. */
bool mrt_check;
47+
/* Largest number of set output bits (MRT writes) still allowed. */
int max_mrts;
48+
};
49+
4150
struct brw_compiler {
4251
const struct gen_device_info *devinfo;
4352

@@ -118,6 +127,8 @@ struct brw_compiler {
118127
* whether nir_opt_large_constants will be run.
119128
*/
120129
bool supports_shader_constants;
130+
131+
struct brw_simd32_heuristics_control simd32_heuristics_control;
121132
};
122133

123134
/**

src/intel/compiler/brw_fs.cpp

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7933,6 +7933,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
79337933
char **error_str)
79347934
{
79357935
const struct gen_device_info *devinfo = compiler->devinfo;
7936+
bool simd16_failed = false;
7937+
bool simd16_spilled = false;
79367938

79377939
shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
79387940
brw_nir_lower_fs_inputs(shader, devinfo, key);
@@ -7998,20 +8000,30 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
79988000
shader_time_index16);
79998001
v16.import_uniforms(&v8);
80008002
if (!v16.run_fs(allow_spilling, use_rep_send)) {
8003+
simd16_failed = true;
80018004
compiler->shader_perf_log(log_data,
80028005
"SIMD16 shader failed to compile: %s",
80038006
v16.fail_msg);
80048007
} else {
8008+
simd16_spilled = v16.spilled_any_registers;
80058009
simd16_cfg = v16.cfg;
80068010
prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
80078011
prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
80088012
}
80098013
}
80108014

80118015
/* Currently, the compiler only supports SIMD32 on SNB+ */
8016+
const brw_simd32_heuristics_control *ctrl = &compiler->simd32_heuristics_control;
8017+
uint64_t mrts = shader->info.outputs_written << FRAG_RESULT_DATA0;
8018+
80128019
if (v8.max_dispatch_width >= 32 && !use_rep_send &&
80138020
compiler->devinfo->gen >= 6 &&
8014-
unlikely(INTEL_DEBUG & DEBUG_DO32)) {
8021+
(unlikely(INTEL_DEBUG & DEBUG_DO32) ||
8022+
(unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
8023+
!simd16_failed && !simd16_spilled &&
8024+
(!ctrl->mrt_check ||
8025+
(ctrl->mrt_check &&
8026+
u_count_bits64(&mrts) <= ctrl->max_mrts))))) {
80158027
/* Try a SIMD32 compile */
80168028
fs_visitor v32(compiler, log_data, mem_ctx, key,
80178029
&prog_data->base, prog, shader, 32,
@@ -8022,9 +8034,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
80228034
"SIMD32 shader failed to compile: %s",
80238035
v32.fail_msg);
80248036
} else {
8025-
simd32_cfg = v32.cfg;
8026-
prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
8027-
prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
8037+
if (likely(!(INTEL_DEBUG & DEBUG_HEUR32)) ||
8038+
v32.run_heuristic(ctrl)) {
8039+
simd32_cfg = v32.cfg;
8040+
prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
8041+
prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
8042+
}
80288043
}
80298044
}
80308045

@@ -8103,13 +8118,49 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
81038118
}
81048119

81058120
if (simd32_cfg) {
8106-
prog_data->dispatch_32 = true;
8107-
prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32);
8121+
uint32_t offset = g.generate_code(simd32_cfg, 32);
8122+
8123+
if (unlikely(INTEL_DEBUG & DEBUG_DO32) ||
8124+
(unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
8125+
(!simd16_cfg ||
8126+
(simd16_cfg &&
8127+
(!ctrl->inst_count_check ||
8128+
(ctrl->inst_count_check &&
8129+
(float)g.get_inst_count(32) / (float)g.get_inst_count(16) <= ctrl->inst_count_ratio)))))) {
8130+
prog_data->dispatch_32 = true;
8131+
prog_data->prog_offset_32 = offset;
8132+
}
81088133
}
81098134

81108135
return g.get_assembly();
81118136
}
81128137

8138+
bool
8139+
fs_visitor::run_heuristic(const struct brw_simd32_heuristics_control *ctrl) {
8140+
int grouped_sends = 0;
8141+
int max_grouped_sends = 0;
8142+
bool pass = true;
8143+
8144+
foreach_block_and_inst(block, fs_inst, inst, cfg) {
8145+
if (inst->opcode >= SHADER_OPCODE_TEX && inst->opcode <= SHADER_OPCODE_SAMPLEINFO_LOGICAL) {
8146+
++grouped_sends;
8147+
} else if (grouped_sends > 0) {
8148+
if (grouped_sends > max_grouped_sends) {
8149+
max_grouped_sends = grouped_sends;
8150+
}
8151+
grouped_sends = 0;
8152+
}
8153+
}
8154+
8155+
if (ctrl->grouped_sends_check) {
8156+
if (max_grouped_sends > ctrl->max_grouped_sends) {
8157+
pass = false;
8158+
}
8159+
}
8160+
8161+
return pass;
8162+
}
8163+
81138164
fs_reg *
81148165
fs_visitor::emit_cs_work_group_id_setup()
81158166
{

src/intel/compiler/brw_fs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,8 @@ class fs_visitor : public backend_shader
300300
void dump_instruction(backend_instruction *inst);
301301
void dump_instruction(backend_instruction *inst, FILE *file);
302302

303+
bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl);
304+
303305
const void *const key;
304306
const struct brw_sampler_prog_key_data *key_tex;
305307

@@ -420,6 +422,7 @@ class fs_generator
420422

421423
void enable_debug(const char *shader_name);
422424
int generate_code(const cfg_t *cfg, int dispatch_width);
425+
int get_inst_count(int dispatch_width);
423426
const unsigned *get_assembly();
424427

425428
private:
@@ -515,6 +518,7 @@ class fs_generator
515518
struct brw_stage_prog_data * const prog_data;
516519

517520
unsigned dispatch_width; /**< 8, 16 or 32 */
521+
int inst_count[3]; /* for 8, 16 and 32 */
518522

519523
exec_list discard_halt_patches;
520524
unsigned promoted_constants;

src/intel/compiler/brw_fs_generator.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2256,6 +2256,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
22562256
fill_count, promoted_constants, before_size,
22572257
after_size);
22582258

2259+
inst_count[ffs(dispatch_width) - 4] = before_size / 16;
2260+
22592261
return start_offset;
22602262
}
22612263

@@ -2264,3 +2266,13 @@ fs_generator::get_assembly()
22642266
{
22652267
return brw_get_program(p, &prog_data->program_size);
22662268
}
2269+
2270+
int
2271+
fs_generator::get_inst_count(int dispatch_width)
2272+
{
2273+
if (dispatch_width == 8 || dispatch_width == 16 || dispatch_width == 32) {
2274+
return inst_count[ffs(dispatch_width) - 4];
2275+
} else {
2276+
return 0;
2277+
}
2278+
}

src/intel/dev/gen_debug.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ static const struct debug_control debug_control[] = {
8686
{ "color", DEBUG_COLOR },
8787
{ "reemit", DEBUG_REEMIT },
8888
{ "soft64", DEBUG_SOFT64 },
89+
{ "heur32", DEBUG_HEUR32 },
8990
{ NULL, 0 }
9091
};
9192

src/intel/dev/gen_debug.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,15 @@ extern uint64_t INTEL_DEBUG;
8484
#define DEBUG_COLOR (1ull << 40)
8585
#define DEBUG_REEMIT (1ull << 41)
8686
#define DEBUG_SOFT64 (1ull << 42)
87+
#define DEBUG_HEUR32 (1ull << 43)
8788

8889
/* These flags are not compatible with the disk shader cache */
8990
#define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME
9091

9192
/* These flags may affect program generation */
9293
#define DEBUG_DISK_CACHE_MASK \
9394
(DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \
94-
DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64)
95+
DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | DEBUG_HEUR32)
9596

9697
#ifdef HAVE_ANDROID_PLATFORM
9798
#define LOG_TAG "INTEL-MESA"

src/mesa/drivers/dri/i965/brw_context.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,19 @@ brw_process_driconf_options(struct brw_context *brw)
914914
ctx->Const.dri_config_options_sha1 = ralloc_array(brw, unsigned char, 20);
915915
driComputeOptionsSha1(&brw->screen->optionCache,
916916
ctx->Const.dri_config_options_sha1);
917+
918+
brw->screen->compiler->simd32_heuristics_control.grouped_sends_check =
919+
driQueryOptionb(&brw->optionCache, "simd32_heuristic_grouped_check");
920+
brw->screen->compiler->simd32_heuristics_control.max_grouped_sends =
921+
driQueryOptioni(&brw->optionCache, "simd32_heuristic_grouped_sends");
922+
brw->screen->compiler->simd32_heuristics_control.inst_count_check =
923+
driQueryOptionb(&brw->optionCache, "simd32_heuristic_inst_check");
924+
brw->screen->compiler->simd32_heuristics_control.inst_count_ratio =
925+
driQueryOptionf(&brw->optionCache, "simd32_heuristic_inst_ratio");
926+
brw->screen->compiler->simd32_heuristics_control.mrt_check =
927+
driQueryOptionb(&brw->optionCache, "simd32_heuristic_mrt_check");
928+
brw->screen->compiler->simd32_heuristics_control.max_mrts =
929+
driQueryOptioni(&brw->optionCache, "simd32_heuristic_max_mrts");
917930
}
918931

919932
GLboolean

src/mesa/drivers/dri/i965/intel_screen.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,33 @@ DRI_CONF_BEGIN
6262
DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
6363
DRI_CONF_DESC_END
6464
DRI_CONF_OPT_END
65+
66+
DRI_CONF_OPT_BEGIN_B(simd32_heuristic_grouped_check, "true")
67+
DRI_CONF_DESC(en, "Enable/disable grouped texture fetch "
68+
"check in the SIMD32 selection heuristic.")
69+
DRI_CONF_OPT_END
70+
DRI_CONF_OPT_BEGIN_V(simd32_heuristic_grouped_sends, int, 6, "1:999")
71+
DRI_CONF_DESC(en, "How many grouped texture fetches should "
72+
"the SIMD32 selection heuristic allow.")
73+
DRI_CONF_OPT_END
74+
DRI_CONF_OPT_BEGIN_B(simd32_heuristic_inst_check, "true")
75+
DRI_CONF_DESC(en, "Enable/disable SIMD32/SIMD16 instruction "
76+
"count ratio check in the SIMD32 selection "
77+
"heuristic.")
78+
DRI_CONF_OPT_END
79+
DRI_CONF_OPT_BEGIN_V(simd32_heuristic_inst_ratio, float, 2.3, "1:999")
80+
DRI_CONF_DESC(en, "SIMD32/SIMD16 instruction count ratio "
81+
"the SIMD32 selection heuristic should allow.")
82+
DRI_CONF_OPT_END
83+
DRI_CONF_OPT_BEGIN_B(simd32_heuristic_mrt_check, "true")
84+
DRI_CONF_DESC(en, "Enable/disable MRT write check in the "
85+
"SIMD32 selection heuristic.")
86+
DRI_CONF_OPT_END
87+
DRI_CONF_OPT_BEGIN_V(simd32_heuristic_max_mrts, int, 1, "1:8")
88+
DRI_CONF_DESC(en, "How many MRT writes should the SIMD32 "
89+
"selection heuristic allow.")
90+
DRI_CONF_OPT_END
91+
6592
DRI_CONF_MESA_NO_ERROR("false")
6693
DRI_CONF_MESA_GLTHREAD("false")
6794
DRI_CONF_SECTION_END

src/util/bitscan.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,31 @@ u_bit_scan64(uint64_t *mask)
112112
return i;
113113
}
114114

115+
/**
 * Count the bits set in a mask.
 *
 * The mask is only read, so it is taken through a pointer-to-const; callers
 * passing a plain `unsigned *` keep working unchanged.
 *
 * \param mask  Pointer to the mask to inspect; never modified.
 * \return Number of set bits. The constants assume a 32-bit `unsigned`.
 */
static inline int
u_count_bits(const unsigned *mask)
{
   unsigned v = *mask;

   /* Parallel population count: sum bits in pairs, then nibbles, then bytes. */
   v = v - ((v >> 1) & 0x55555555u);
   v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
   v = (v + (v >> 4)) & 0x0F0F0F0Fu;

   /* The multiply accumulates the four per-byte counts into the top byte. */
   return (int)((v * 0x01010101u) >> 24);
}
127+
128+
/**
 * Count the bits set in a 64-bit mask.
 *
 * The mask is only read, so it is taken through a pointer-to-const; callers
 * passing a plain `uint64_t *` keep working unchanged.
 *
 * \param mask  Pointer to the mask to inspect; never modified.
 * \return Number of set bits, in the range 0..64.
 */
static inline int
u_count_bits64(const uint64_t *mask)
{
   uint64_t v = *mask;

   /* Parallel population count: sum bits in pairs, then nibbles, then bytes. */
   v = v - ((v >> 1) & 0x5555555555555555ull);
   v = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull);
   v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;

   /* The multiply accumulates the eight per-byte counts into the top byte. */
   return (int)((v * 0x0101010101010101ull) >> 56);
}
139+
115140
/* Determine if an unsigned value is a power of two.
116141
*
117142
* \note

0 commit comments

Comments
 (0)