From 27b754cef3d6a2c12f4cefeb1c22c6067db45894 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Mon, 21 Mar 2022 16:45:03 -0400 Subject: [PATCH 01/18] end_warp's calculation was not totally accurate --- src/gpgpu-sim/shader.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 814311d1c..7b5665ba1 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -522,8 +522,10 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, m_threadState[i].n_insn = 0; m_threadState[i].m_cta_id = -1; } - for (unsigned i = start_thread / m_config->warp_size; - i < end_thread / m_config->warp_size; ++i) { + const unsigned start_warp = start_thread / m_config->warp_size; + const unsigned end_warp = end_thread / m_config->warp_size + + ((end_thread % m_config->warp_size) ? 1 : 0); + for (unsigned i = start_warp; i < end_warp; ++i) { m_warp[i]->reset(); m_simt_stack[i]->reset(); } From a8a89d107fe372759c1db42758a0c2172cd91670 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Mon, 21 Mar 2022 16:45:59 -0400 Subject: [PATCH 02/18] the way how hwtid is assigned looks subobtimal. Is this the way how NVIDIA's GPUs work? --- src/gpgpu-sim/gpu-sim.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 5af244b33..0a0150544 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1586,6 +1586,9 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { } } +//confusion: Seems like this function is seeking a contiguous range of hwtid that starts +//from an integer multiple of cta_size. This can leave holes in the range of hwtids. +//Is this overly restrictive? 
int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { From 20f252b508c8cbd27406fa86da53371eb12fb766 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 22 Mar 2022 22:14:22 -0400 Subject: [PATCH 03/18] added some documentation for core functions relavant to subcore scheduling --- src/gpgpu-sim/gpu-sim.cc | 12 ++++++++++++ src/gpgpu-sim/shader.cc | 17 ++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 0a0150544..dafebce5e 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1589,6 +1589,15 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { //confusion: Seems like this function is seeking a contiguous range of hwtid that starts //from an integer multiple of cta_size. This can leave holes in the range of hwtids. //Is this overly restrictive? +/** + * @brief Tries to find a contiguous range of available {hw_tid}s (and mark them as occupied). + * + * @param cta_size How many threads this CTA contains. Should already be + * "padded" to an integer multiple of the max warp size (m_config->warp_size) + * @param occupy Set to false for a dry run + * @return -1 if a contiguous range that can fit all threads of this cta + * cannot be found, otherwise the hw_tid to which the first thread of this cta maps + */ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { @@ -1706,6 +1715,9 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) set_max_cta(kernel); else + //shader_core_ctx::can_issue_1block should have already verified that one block + //is indeed issueable on this shader core, therefore we expect + //occupy_shader_resource_1block to return true here. 
assert(occupy_shader_resource_1block(kernel, true)); kernel.inc_running(); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 7b5665ba1..33b1def13 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -534,7 +534,8 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_thread, unsigned ctaid, int cta_size, kernel_info_t &kernel) { - // + //when concurrent_sm is enabled, + //both start_thread and end_thread are hwtid (0 <= x < n_thread_per_shader) address_type start_pc = next_pc(start_thread); unsigned kernel_id = kernel.get_uid(); if (m_config->model == POST_DOMINATOR) { @@ -3339,6 +3340,20 @@ void shader_core_ctx::display_pipeline(FILE *fout, int print_mem, } } +/** + * @brief Given the resource requirements per CTA of a kernel, calculate how + * many such CTAs can a shader core sustain when it is "empty". In other words, + * it checks if the CTA is too "fat" to fit on a core; if it can, how many. + * + * Although this function is declared to be const (promises not to modify any + * state of the shader_core_config class), it also checks if + * adaptive_cache_config is + * enabled and if yes, it might modify some states of the cache configuration. + * Read the code yourself if you are concerned! + * + * @param k + * @return unsigned int How many CTAs of the kernel can be sustained on a core. 
+ */ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { unsigned threads_per_cta = k.threads_per_cta(); const class function_info *kernel = k.entry(); From 005db44976e9abc3934f97445c0a271396afd424 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sat, 26 Mar 2022 18:00:17 -0400 Subject: [PATCH 04/18] added debugging support for subcore modelling --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 ++++- src/gpgpu-sim/gpu-sim.cc | 23 +++++++++++++------ src/gpgpu-sim/shader.cc | 1 + src/gpgpu-sim/shader.h | 5 ++-- src/trace_streams.tup | 1 + 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 8d2b10199..24e258390 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -234,4 +234,8 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 \ No newline at end of file +#-trace_sampling_core 0 + +-trace_enabled 1 +-trace_components SUBCORE +-trace_sampling_core -1 \ No newline at end of file diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index dafebce5e..609ae0225 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1594,11 +1594,12 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { * * @param cta_size How many threads this CTA contains. 
Should already be * "padded" to an integer multiple of the max warp size (m_config->warp_size) - * @param occupy Set to false for a dry run + * @param occupy Set to "false" for a dry run * @return -1 if a contiguous range that can fit all threads of this cta * cannot be found, otherwise the hw_tid to which the first thread of this cta maps */ -int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { +int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { + //TODO: use round robin based on dynamic_warp id; leave no gaps. unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { unsigned int hw_tid; @@ -1608,12 +1609,15 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { if (hw_tid == step + cta_size) // consecutive non-active break; } - if (step >= m_config->n_thread_per_shader) // didn't find + if (step >= m_config->n_thread_per_shader){ // didn't find + DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid()); return -1; + } else { if (occupy) { for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) m_occupied_hwtid.set(hw_tid); + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, step, step+cta_size-1, kernel.get_uid()); } return step; } @@ -1631,13 +1635,16 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) return false; - if (find_available_hwtid(padded_cta_size, false) == -1) return false; + if (find_available_hwtid(padded_cta_size, k, false) == -1) return false; const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size) return false; + //TODO: check if each subcore has enough regs for this block + 
//this requires tracking the amount of available regs per subcore, + //plus knowning how many warps are to be issued on each subcore. unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3); if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers) return false; @@ -1661,7 +1668,7 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, } void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, - kernel_info_t &k) { + const kernel_info_t &k) { if (m_config->gpgpu_concurrent_kernel_sm) { unsigned threads_per_cta = k.threads_per_cta(); const class function_info *kernel = k.entry(); @@ -1678,6 +1685,7 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size; hwtid++) m_occupied_hwtid.reset(hwtid); + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1714,11 +1722,12 @@ unsigned exec_shader_core_ctx::sim_init_thread( void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) set_max_cta(kernel); - else + else{ //shader_core_ctx::can_issue_1block should have already verified that one block //is indeed issueable on this shader core, therefore we expect //occupy_shader_resource_1block to return true here. 
assert(occupy_shader_resource_1block(kernel, true)); + } kernel.inc_running(); @@ -1755,7 +1764,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; } else { - start_thread = find_available_hwtid(padded_cta_size, true); + start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 33b1def13..67c09a28d 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -477,6 +477,7 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, config->max_barriers_per_cta, config->warp_size), m_active_warps(0), m_dynamic_warp_id(0) { + m_cluster = cluster; m_config = config; m_memory_config = mem_config; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index c3e6f93ed..8ef30c969 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -2514,8 +2514,8 @@ class shader_core_ctx : public core_t { public: bool can_issue_1block(kernel_info_t &kernel); bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy); - void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel); - int find_available_hwtid(unsigned int cta_size, bool occupy); + void release_shader_resource_1block(unsigned hw_ctaid, const kernel_info_t &kernel); + int find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy); private: unsigned int m_occupied_n_threads; @@ -2559,6 +2559,7 @@ class exec_shader_core_ctx : public shader_core_ctx { }; class simt_core_cluster { + friend class shader_core_ctx; public: simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, const shader_core_config *config, diff --git a/src/trace_streams.tup b/src/trace_streams.tup index 074c7c880..4457f6c25 100644 --- a/src/trace_streams.tup +++ 
b/src/trace_streams.tup @@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type ) TS_TUP( MEMORY_SUBPARTITION_UNIT ), TS_TUP( INTERCONNECT ), TS_TUP( LIVENESS ), + TS_TUP( SUBCORE ), TS_TUP( NUM_TRACE_STREAMS ) TS_TUP_END( trace_streams_type ) From 7bb066731d9fa73f07457f46f007f3e83ffca4c1 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sat, 26 Mar 2022 18:00:17 -0400 Subject: [PATCH 05/18] added debugging support for subcore modelling also added a const kernel_info_t& to the arg list of shader_core_ctx::find_available_hwtid. This allows trace calls within this function to be able to find the SM id. --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 ++++- src/gpgpu-sim/gpu-sim.cc | 23 +++++++++++++------ src/gpgpu-sim/shader.cc | 1 + src/gpgpu-sim/shader.h | 5 ++-- src/trace_streams.tup | 1 + 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 8d2b10199..24e258390 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -234,4 +234,8 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 \ No newline at end of file +#-trace_sampling_core 0 + +-trace_enabled 1 +-trace_components SUBCORE +-trace_sampling_core -1 \ No newline at end of file diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index dafebce5e..609ae0225 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1594,11 +1594,12 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { * * @param cta_size How many threads this CTA contains. 
Should already be * "padded" to an integer multiple of the max warp size (m_config->warp_size) - * @param occupy Set to false for a dry run + * @param occupy Set to "false" for a dry run * @return -1 if a contiguous range that can fit all threads of this cta * cannot be found, otherwise the hw_tid to which the first thread of this cta maps */ -int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { +int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { + //TODO: use round robin based on dynamic_warp id; leave no gaps. unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { unsigned int hw_tid; @@ -1608,12 +1609,15 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { if (hw_tid == step + cta_size) // consecutive non-active break; } - if (step >= m_config->n_thread_per_shader) // didn't find + if (step >= m_config->n_thread_per_shader){ // didn't find + DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid()); return -1; + } else { if (occupy) { for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) m_occupied_hwtid.set(hw_tid); + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, step, step+cta_size-1, kernel.get_uid()); } return step; } @@ -1631,13 +1635,16 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) return false; - if (find_available_hwtid(padded_cta_size, false) == -1) return false; + if (find_available_hwtid(padded_cta_size, k, false) == -1) return false; const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size) return false; + //TODO: check if each subcore has enough regs for this block + 
//this requires tracking the amount of available regs per subcore, + //plus knowning how many warps are to be issued on each subcore. unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3); if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers) return false; @@ -1661,7 +1668,7 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, } void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, - kernel_info_t &k) { + const kernel_info_t &k) { if (m_config->gpgpu_concurrent_kernel_sm) { unsigned threads_per_cta = k.threads_per_cta(); const class function_info *kernel = k.entry(); @@ -1678,6 +1685,7 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size; hwtid++) m_occupied_hwtid.reset(hwtid); + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1714,11 +1722,12 @@ unsigned exec_shader_core_ctx::sim_init_thread( void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) set_max_cta(kernel); - else + else{ //shader_core_ctx::can_issue_1block should have already verified that one block //is indeed issueable on this shader core, therefore we expect //occupy_shader_resource_1block to return true here. 
assert(occupy_shader_resource_1block(kernel, true)); + } kernel.inc_running(); @@ -1755,7 +1764,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; } else { - start_thread = find_available_hwtid(padded_cta_size, true); + start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 33b1def13..67c09a28d 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -477,6 +477,7 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, config->max_barriers_per_cta, config->warp_size), m_active_warps(0), m_dynamic_warp_id(0) { + m_cluster = cluster; m_config = config; m_memory_config = mem_config; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index c3e6f93ed..8ef30c969 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -2514,8 +2514,8 @@ class shader_core_ctx : public core_t { public: bool can_issue_1block(kernel_info_t &kernel); bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy); - void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel); - int find_available_hwtid(unsigned int cta_size, bool occupy); + void release_shader_resource_1block(unsigned hw_ctaid, const kernel_info_t &kernel); + int find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy); private: unsigned int m_occupied_n_threads; @@ -2559,6 +2559,7 @@ class exec_shader_core_ctx : public shader_core_ctx { }; class simt_core_cluster { + friend class shader_core_ctx; public: simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, const shader_core_config *config, diff --git a/src/trace_streams.tup b/src/trace_streams.tup index 074c7c880..4457f6c25 100644 --- a/src/trace_streams.tup +++ 
b/src/trace_streams.tup @@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type ) TS_TUP( MEMORY_SUBPARTITION_UNIT ), TS_TUP( INTERCONNECT ), TS_TUP( LIVENESS ), + TS_TUP( SUBCORE ), TS_TUP( NUM_TRACE_STREAMS ) TS_TUP_END( trace_streams_type ) From 9601dfb89b3128d6d76c6c9f5e2bdf0a8ae533ad Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 29 Mar 2022 10:56:52 -0400 Subject: [PATCH 06/18] implemented RR in find_hwtid --- src/gpgpu-sim/gpu-sim.cc | 50 +++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 609ae0225..37ba6134d 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1599,15 +1599,34 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { * cannot be found, otherwise the hw_tid to which the first thread of this cta maps */ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { - //TODO: use round robin based on dynamic_warp id; leave no gaps. - unsigned int step; - for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { - unsigned int hw_tid; - for (hw_tid = step; hw_tid < step + cta_size; hw_tid++) { - if (m_occupied_hwtid.test(hw_tid)) break; + //TODO: use round robin based on dynamic_warp id + const unsigned int& warp_size = m_config->warp_size; + + unsigned int step=0; + while(step < m_config->n_thread_per_shader) { + //Subcore experiments on Volta V100 + //show that warps are assigned to subcores in a Round-Robin fashion, + //so we should start testing from the successor of the subcore + //to which the last warp was assigned. 
+ + //Note: Warp ids are bound to a specific scheduler - which + //is equivalent to a subcore - based on (warp_id modulo # of schedulers) + + //m_dynamic_warp_id is incremented after a warp has been initiated, + //therefore we don't need to add one to find the "next" subcore + //(ref: shader_core_ctx::init_warps) + unsigned int i; + for (i = step; i < step + cta_size; i++) { + unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; + if (m_occupied_hwtid.test(hw_tid)) break; //break from this inner for-loop + } + if (i == step + cta_size) // consecutive non-active + break; //break from the outer while-loop + else { + //start from the next warp slot + //e.g. if step was 32, i was 35, and warp_size is 32, then step will be updated to 64 + step = (i / warp_size + 1) * warp_size; } - if (hw_tid == step + cta_size) // consecutive non-active - break; } if (step >= m_config->n_thread_per_shader){ // didn't find DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid()); @@ -1615,11 +1634,13 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_in } else { if (occupy) { - for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) + for (unsigned i = step; i < step + cta_size; i++){ + unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; m_occupied_hwtid.set(hw_tid); - DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, step, step+cta_size-1, kernel.get_uid()); + } + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid()); } - return step; + return (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; } } @@ -1635,6 
+1656,13 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) return false; + //Even if the amount of available "thread slots" exceed our CTA size, + //if these slots are fragmented (non-continuous regions), + //we still might not be able to launch this CTA. + //Obviously fragmentation can only happen on the granularity of warp size + //since hwtids are allocated on the granularity of warp_size. + //It remains a TODO to find out if a CTA *can* launch when the warps of this CTA + //have no choice but map to non contiguous regions of hwtid. if (find_available_hwtid(padded_cta_size, k, false) == -1) return false; const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); From 7bcda79823895e2007a29f2a213ccaadf8fa1d0f Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 29 Mar 2022 21:45:40 -0400 Subject: [PATCH 07/18] added DTRACE to sucore-related code; addressed wrap-around issue --- src/gpgpu-sim/gpu-sim.cc | 100 +++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 30 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 37ba6134d..36db78d82 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1585,18 +1585,18 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { return (get_n_active_cta() < m_config->max_cta(kernel)); } } - -//confusion: Seems like this function is seeking a contiguous range of hwtid that starts -//from an integer multiple of cta_size. This can leave holes in the range of hwtids. -//Is this overly restrictive? + /** * @brief Tries to find a contiguous range of available {hw_tid}s (and mark them as occupied). + * Wrap-arounds are allowed. * * @param cta_size How many threads this CTA contains. 
Should already be * "padded" to an integer multiple of the max warp size (m_config->warp_size) * @param occupy Set to "false" for a dry run * @return -1 if a contiguous range that can fit all threads of this cta - * cannot be found, otherwise the hw_tid to which the first thread of this cta maps + * cannot be found, otherwise the hw_tid to which the first thread of this cta maps. Note + * that since wrap-arounds can happen, naively adding cta_size to the retval - which is the + * start_thread - can result in a value exceeding the simulated hardware limits */ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { //TODO: use round robin based on dynamic_warp id @@ -1634,11 +1634,11 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_in } else { if (occupy) { + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid()); for (unsigned i = step; i < step + cta_size; i++){ unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; m_occupied_hwtid.set(hw_tid); } - DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid()); } return (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; } @@ -1710,10 +1710,10 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, int start_thread = m_occupied_cta_to_hwtid[hw_ctaid]; + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); for (unsigned hwtid = 
start_thread; hwtid < start_thread + padded_cta_size; hwtid++) m_occupied_hwtid.reset(hwtid); - DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1791,18 +1791,34 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; + end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1; } else { start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; + //It is necessary to perform a wrap-around. See impl. of find_available_hwtid. + end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == m_occupied_cta_to_hwtid.end()); m_occupied_cta_to_hwtid[free_cta_hw_id] = start_thread; } + // A lot of legacy function that take in a range of thread ids + // are built upon the assumption that no wrap-around happens. + // However with the subcore model this is no longer true. 
We need + // to separately process the two regions of thread id if wrap-around happens + const bool wrap_around_happens = (end_thread < start_thread); + // reset the microarchitecture state of the selected hardware thread and warp // contexts - reinit(start_thread, end_thread, false); + if(!wrap_around_happens){ + reinit(start_thread, end_thread, false); + } + else{ + reinit(start_thread, m_config->n_thread_per_shader, false); + reinit(0, end_thread, false); + } + // initalize scalar threads and determine which hardware warps they are // allocated to bind functional simulation state of threads to hardware @@ -1813,29 +1829,46 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { symbol_table *symtab = kernel_func_info->get_symtab(); unsigned ctaid = kernel.get_next_cta_id_single(); checkpoint *g_checkpoint = new checkpoint(); - for (unsigned i = start_thread; i < end_thread; i++) { - m_threadState[i].m_cta_id = free_cta_hw_id; - unsigned warp_id = i / m_config->warp_size; - nthreads_in_block += sim_init_thread( - kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread), - m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, - m_cluster->get_gpu()); - m_threadState[i].m_active = true; - // load thread local memory and register file - if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && - ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { - char fname[2048]; - snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", - i % cta_size, ctaid); - m_thread[i]->resume_reg_thread(fname, symtab); - char f1name[2048]; - snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", - i % cta_size, ctaid); - g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); + + // here is the definition of a lambda that faciliates the processing of + // disjoint thread regions in the case of wrap-around + // Everything is captured by reference so any modification within the + // lambda can affect 
the outer value being referenced + auto prepare_threads = [&](unsigned int _start_thread, unsigned int _end_thread) { + for (unsigned i = start_thread; i < end_thread; i++) { + m_threadState[i].m_cta_id = free_cta_hw_id; + unsigned warp_id = i / m_config->warp_size; + nthreads_in_block += sim_init_thread( + kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread), + m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, + m_cluster->get_gpu()); + m_threadState[i].m_active = true; + // load thread local memory and register file + if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && + ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { + char fname[2048]; + snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", + i % cta_size, ctaid); + m_thread[i]->resume_reg_thread(fname, symtab); + char f1name[2048]; + snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", + i % cta_size, ctaid); + g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); + } + // + warps.set(warp_id); } - // - warps.set(warp_id); + }; + + //the lambda is invoked here + if(!wrap_around_happens){ + prepare_threads(start_thread, end_thread); } + else{ + prepare_threads(start_thread, m_config->n_thread_per_shader); + prepare_threads(0, end_thread); + } + assert(nthreads_in_block > 0 && nthreads_in_block <= m_config->n_thread_per_shader); // should be at least one, but @@ -1854,7 +1887,14 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { m_barriers.allocate_barrier(free_cta_hw_id, warps); // initialize the SIMT stacks and fetch hardware - init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); + if(!wrap_around_happens){ + init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); + } + else{ + init_warps(free_cta_hw_id, start_thread, m_config->n_thread_per_shader, ctaid, cta_size, kernel); + init_warps(free_cta_hw_id, 0, end_thread, ctaid, cta_size, kernel); + 
} + m_n_active_cta++; shader_CTA_count_log(m_sid, 1); From caafcb3dd6d17222f4713b34343ea0990c9bbb62 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 29 Mar 2022 23:43:03 -0400 Subject: [PATCH 08/18] fixed coding mistake in previous commit --- src/gpgpu-sim/gpu-sim.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 36db78d82..bae63e121 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1830,16 +1830,20 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { unsigned ctaid = kernel.get_next_cta_id_single(); checkpoint *g_checkpoint = new checkpoint(); - // here is the definition of a lambda that faciliates the processing of - // disjoint thread regions in the case of wrap-around + //used to pass in as the "threads_left" argument passed to sim_init_thread + int threads_left = cta_size; + + // Here is the definition of a lambda that faciliates the processing of + // disjoint thread regions in the case of tid wrap-around. 
// Everything is captured by reference so any modification within the // lambda can affect the outer value being referenced + // Note we are using a lambda-local _start_thread / _end_thread value auto prepare_threads = [&](unsigned int _start_thread, unsigned int _end_thread) { - for (unsigned i = start_thread; i < end_thread; i++) { + for (unsigned i = _start_thread; i < _end_thread; i++) { m_threadState[i].m_cta_id = free_cta_hw_id; unsigned warp_id = i / m_config->warp_size; nthreads_in_block += sim_init_thread( - kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread), + kernel, &m_thread[i], m_sid, i, cta_size--, m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, m_cluster->get_gpu()); m_threadState[i].m_active = true; From cc3789ff37aad443ca7987a6d7adce834b2fceac Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 13:40:13 -0400 Subject: [PATCH 09/18] use range-based for loop to iterate over hwtids which might have wrap-arounds due to subcore scheduling Plus a utility function get_index_vector_from_range_with_wrap_around used to generate the vector of indices. --- src/gpgpu-sim/gpu-sim.cc | 96 +++++++++++++++------------------------- src/gpgpu-sim/gpu-sim.h | 37 ++++++++++++++++ src/gpgpu-sim/shader.cc | 20 +++++++-- 3 files changed, 89 insertions(+), 64 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index bae63e121..6c3cddef8 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1786,8 +1786,16 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { padded_cta_size = ((cta_size / m_config->warp_size) + 1) * (m_config->warp_size); - unsigned int start_thread, end_thread; + //find available hwtids + // Note: A lot of legacy function that take in a range of thread ids + // are built upon the assumption that no wrap-around happens. + // However with the subcore model this is no longer true. + // It is hence necessary to perform a wrap-around. + // E.g. 
to demo the effect of wrap-around, if CTA size is 10, + // n_thread_per_shader is 20 and start_thread is 18, end thread will + // not be 28 but 8. + unsigned int start_thread, end_thread; if (!m_config->gpgpu_concurrent_kernel_sm) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; @@ -1796,30 +1804,17 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; - //It is necessary to perform a wrap-around. See impl. of find_available_hwtid. + end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == m_occupied_cta_to_hwtid.end()); m_occupied_cta_to_hwtid[free_cta_hw_id] = start_thread; } - // A lot of legacy function that take in a range of thread ids - // are built upon the assumption that no wrap-around happens. - // However with the subcore model this is no longer true. We need - // to separately process the two regions of thread id if wrap-around happens - const bool wrap_around_happens = (end_thread < start_thread); - // reset the microarchitecture state of the selected hardware thread and warp // contexts - if(!wrap_around_happens){ - reinit(start_thread, end_thread, false); - } - else{ - reinit(start_thread, m_config->n_thread_per_shader, false); - reinit(0, end_thread, false); - } + reinit(start_thread, end_thread, false); - // initalize scalar threads and determine which hardware warps they are // allocated to bind functional simulation state of threads to hardware // resources (simulation) @@ -1833,46 +1828,31 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { //used to pass in as the "threads_left" argument passed to sim_init_thread int threads_left = cta_size; - // Here is the definition of a lambda that faciliates the processing of - // disjoint thread regions in the case of tid wrap-around.
- // Everything is captured by reference so any modification within the - // lambda can affect the outer value being referenced - // Note we are using a lambda-local _start_thread / _end_thread value - auto prepare_threads = [&](unsigned int _start_thread, unsigned int _end_thread) { - for (unsigned i = _start_thread; i < _end_thread; i++) { - m_threadState[i].m_cta_id = free_cta_hw_id; - unsigned warp_id = i / m_config->warp_size; - nthreads_in_block += sim_init_thread( - kernel, &m_thread[i], m_sid, i, cta_size--, - m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, - m_cluster->get_gpu()); - m_threadState[i].m_active = true; - // load thread local memory and register file - if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && - ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { - char fname[2048]; - snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", - i % cta_size, ctaid); - m_thread[i]->resume_reg_thread(fname, symtab); - char f1name[2048]; - snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", - i % cta_size, ctaid); - g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); - } - // - warps.set(warp_id); + auto tids = get_index_vector_from_range_with_wrap_around + (start_thread, end_thread, m_config->max_warps_per_shader); + for (unsigned i : tids) { + m_threadState[i].m_cta_id = free_cta_hw_id; + unsigned warp_id = i / m_config->warp_size; + nthreads_in_block += sim_init_thread( + kernel, &m_thread[i], m_sid, i, threads_left--, + m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, + m_cluster->get_gpu()); + m_threadState[i].m_active = true; + // load thread local memory and register file + if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && + ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { + char fname[2048]; + snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", + i % cta_size, ctaid); + 
m_thread[i]->resume_reg_thread(fname, symtab); + char f1name[2048]; + snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", + i % cta_size, ctaid); + g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); } - }; - - //the lambda is invoked here - if(!wrap_around_happens){ - prepare_threads(start_thread, end_thread); - } - else{ - prepare_threads(start_thread, m_config->n_thread_per_shader); - prepare_threads(0, end_thread); + // + warps.set(warp_id); } - assert(nthreads_in_block > 0 && nthreads_in_block <= m_config->n_thread_per_shader); // should be at least one, but @@ -1891,13 +1871,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { m_barriers.allocate_barrier(free_cta_hw_id, warps); // initialize the SIMT stacks and fetch hardware - if(!wrap_around_happens){ - init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); - } - else{ - init_warps(free_cta_hw_id, start_thread, m_config->n_thread_per_shader, ctaid, cta_size, kernel); - init_warps(free_cta_hw_id, 0, end_thread, ctaid, cta_size, kernel); - } + init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); m_n_active_cta++; diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index de69ef8ce..c14c8957e 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "../abstract_hardware_model.h" #include "../option_parser.h" #include "../trace.h" @@ -735,4 +736,40 @@ class exec_gpgpu_sim : public gpgpu_sim { virtual void createSIMTCluster(); }; +/** + * @brief Generates a constant vector of indices starting from start_index (inclusive), + * ending at end_index (exclusive), and may wrap around at the value specified by + * wrap_around_threshold. The sequence restarts at 0 (inclusive) after wrapping. + * + * E.g. 
a 3-tuple of arguments (start_index=7, end_index=3, wrap_around_threshold=10) + * will generate the following vector: {7, 8, 9, 0, 1, 2} + * + * @param start_index + * @param end_index + * @param wrap_around_threshold This value is non-reachable by the sequence. + * @return A const vector of the indices specified by the 3-tuple + */ +template +const std::vector get_index_vector_from_range_with_wrap_around(T start_index, T end_index, T wrap_around_threshold){ + assert(start_index>=0); + assert(start_index0); + assert(end_index<=wrap_around_threshold); + + //how large this vector is gonna be? + unsigned int range_size = (end_index > start_index) ? (end_index - start_index) : (wrap_around_threshold - start_index + end_index); + + std::vector vec; + vec.reserve(range_size); + + T index=start_index; + while(index!=end_index){ + if(index>=wrap_around_threshold){index=0;} + vec.push_back(index); + ++index; + } + + return vec; +} + #endif diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 67c09a28d..74163a5db 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -519,19 +519,29 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, m_occupied_cta_to_hwtid.clear(); m_active_warps = 0; } - for (unsigned i = start_thread; i < end_thread; i++) { + + auto tids = get_index_vector_from_range_with_wrap_around + (stard_thread, end_thread, m_config->n_thread_per_shader); + for (unsigned i : tids) { m_threadState[i].n_insn = 0; m_threadState[i].m_cta_id = -1; } const unsigned start_warp = start_thread / m_config->warp_size; const unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 
1 : 0); - for (unsigned i = start_warp; i < end_warp; ++i) { + + auto warp_ids = get_index_vector_from_range_with_wrap_around + (stard_warp, end_warp, m_config->max_warps_per_shader); + for (unsigned i : warp_ids) { m_warp[i]->reset(); m_simt_stack[i]->reset(); } } +/** + * @brief Note: To handle the case of hwtid wrap-around (end_thread < start_thread), + * this method will generate a const vec of warp ids to iterate over in a range-based for loop. + */ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_thread, unsigned ctaid, int cta_size, kernel_info_t &kernel) { @@ -544,7 +554,11 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned warp_per_cta = cta_size / m_config->warp_size; unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 1 : 0); - for (unsigned i = start_warp; i < end_warp; ++i) { + + auto warp_ids = get_index_vector_from_range_with_wrap_around + (start_warp, end_warp, m_config->max_warps_per_shader); + + for (unsigned i : warp_ids) { unsigned n_active = 0; simt_mask_t active_threads; for (unsigned t = 0; t < m_config->warp_size; t++) { From 3eee55ad6be593b1b901302d140003cd15f6b97c Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 13:47:17 -0400 Subject: [PATCH 10/18] fixed some typos; also fixed wrong new[] call --- src/gpgpu-sim/shader.cc | 4 ++-- src/gpgpu-sim/shader.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 74163a5db..851875e65 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -521,7 +521,7 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, } auto tids = get_index_vector_from_range_with_wrap_around - (stard_thread, end_thread, m_config->n_thread_per_shader); + (start_thread, end_thread, m_config->n_thread_per_shader); for (unsigned i : tids) { m_threadState[i].n_insn = 0; m_threadState[i].m_cta_id = 
-1; @@ -531,7 +531,7 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, ((end_thread % m_config->warp_size) ? 1 : 0); auto warp_ids = get_index_vector_from_range_with_wrap_around - (stard_warp, end_warp, m_config->max_warps_per_shader); + (start_warp, end_warp, m_config->max_warps_per_shader); for (unsigned i : warp_ids) { m_warp[i]->reset(); m_simt_stack[i]->reset(); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 8ef30c969..155f479c3 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -957,7 +957,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_sub_core_model = sub_core_model; m_num_warp_scheds = num_warp_scheds; if (m_sub_core_model) { - m_last_cu_set = new unsigned(m_num_warp_scheds); + m_last_cu_set = new unsigned[m_num_warp_scheds]; for (unsigned i = 0; i < m_num_warp_scheds; i++) { m_last_cu_set[i] = i * m_num_collectors / m_num_warp_scheds; From dc18c0be4bf8749c27e981461a8f8279ad451062 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 13:57:36 -0400 Subject: [PATCH 11/18] fixed some typo --- src/gpgpu-sim/gpu-sim.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 6c3cddef8..9df76e5ec 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1829,7 +1829,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { int threads_left = cta_size; auto tids = get_index_vector_from_range_with_wrap_around - (start_thread, end_thread, m_config->max_warps_per_shader); + (start_thread, end_thread, m_config->n_thread_per_shader); for (unsigned i : tids) { m_threadState[i].m_cta_id = free_cta_hw_id; unsigned warp_id = i / m_config->warp_size; From c1750eafdb21ed1baa04ba5d22502cb0499867f6 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 23:25:16 -0400 Subject: [PATCH 12/18] use functional programming to address the id wrap-around issue --- 
src/gpgpu-sim/gpu-sim.cc | 19 +++-- src/gpgpu-sim/gpu-sim.h | 148 +++++++++++++++++++++++++++++++-------- src/gpgpu-sim/shader.cc | 30 ++++---- 3 files changed, 144 insertions(+), 53 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 9df76e5ec..136ed0261 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1708,12 +1708,17 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, assert(m_occupied_n_threads >= padded_cta_size); m_occupied_n_threads -= padded_cta_size; - int start_thread = m_occupied_cta_to_hwtid[hw_ctaid]; + unsigned start_thread = m_occupied_cta_to_hwtid[hw_ctaid]; + unsigned end_thread = ((start_thread + padded_cta_size) - 1) % m_config->n_thread_per_shader + 1; + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); - for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size; - hwtid++) + + tid_range.loop([&](const unsigned hwtid){ m_occupied_hwtid.reset(hwtid); + }); + m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1828,9 +1833,8 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { //used to pass in as the "threads_left" argument passed to sim_init_thread int threads_left = cta_size; - auto tids = get_index_vector_from_range_with_wrap_around - (start_thread, end_thread, m_config->n_thread_per_shader); - for (unsigned i : tids) { + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); + tid_range.loop([&](const unsigned i){ m_threadState[i].m_cta_id = free_cta_hw_id; unsigned warp_id = i / m_config->warp_size; nthreads_in_block += sim_init_thread( @@ -1852,7 +1856,8 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { } 
// warps.set(warp_id); - } + }); + assert(nthreads_in_block > 0 && nthreads_in_block <= m_config->n_thread_per_shader); // should be at least one, but diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index c14c8957e..acb63b4b3 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, +// Nikos Hardavellas Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -28,7 +29,6 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
- #ifndef GPU_SIM_H #define GPU_SIM_H @@ -73,7 +73,7 @@ extern tr1_hash_map address_random_interleaving; enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; enum hw_perf_t { - HW_BENCH_NAME=0, + HW_BENCH_NAME = 0, HW_KERNEL_NAME, HW_L1_RH, HW_L1_RM, @@ -109,7 +109,7 @@ struct power_config { s++; } char buf1[1024]; - //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); + // snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); snprintf(buf1, 1024, "accelwattch_power_report.log"); g_power_filename = strdup(buf1); char buf2[1024]; @@ -156,7 +156,6 @@ struct power_config { double gpu_steady_power_deviation; double gpu_steady_min_period; - char *g_hw_perf_file_name; char *g_hw_perf_bench_name; int g_power_simulation_mode; @@ -737,34 +736,42 @@ class exec_gpgpu_sim : public gpgpu_sim { }; /** - * @brief Generates a constant vector of indices starting from start_index (inclusive), - * ending at end_index (exclusive), and may wrap around at the value specified by - * wrap_around_threshold. The sequence restarts at 0 (inclusive) after wrapping. - * - * E.g. a 3-tuple of arguments (start_index=7, end_index=3, wrap_around_threshold=10) - * will generate the following vector: {7, 8, 9, 0, 1, 2} - * - * @param start_index + * @brief Generates a constant vector of indices starting from start_index + * (inclusive), ending at end_index (exclusive), and may wrap around at the + * value specified by wrap_around_threshold. The sequence restarts at 0 + * (inclusive) after wrapping. + * + * E.g. a 3-tuple of arguments (start_index=7, end_index=3, + * wrap_around_threshold=10) will generate the following vector: {7, 8, 9, 0, 1, + * 2} + * + * @param start_index * @param end_index * @param wrap_around_threshold This value is non-reachable by the sequence. 
* @return A const vector of the indices specified by the 3-tuple */ template -const std::vector get_index_vector_from_range_with_wrap_around(T start_index, T end_index, T wrap_around_threshold){ - assert(start_index>=0); - assert(start_index0); - assert(end_index<=wrap_around_threshold); - - //how large this vector is gonna be? - unsigned int range_size = (end_index > start_index) ? (end_index - start_index) : (wrap_around_threshold - start_index + end_index); - +const std::vector get_index_vector_from_range_with_wrap_around( + T start_index, T end_index, T wrap_around_threshold) { + assert(start_index >= 0); + assert(start_index < wrap_around_threshold); + assert(end_index > 0); + assert(end_index <= wrap_around_threshold); + + // how large this vector is gonna be? + unsigned int range_size = + (end_index > start_index) + ? (end_index - start_index) + : (wrap_around_threshold - start_index + end_index); + std::vector vec; vec.reserve(range_size); - T index=start_index; - while(index!=end_index){ - if(index>=wrap_around_threshold){index=0;} + T index = start_index; + while (index != end_index) { + if (index >= wrap_around_threshold) { + index = 0; + } vec.push_back(index); ++index; } @@ -772,4 +779,87 @@ const std::vector get_index_vector_from_range_with_wrap_around(T start_index, return vec; } +/** + * @brief Represents a range of unsigned indices that can wrap around + * at a certain threshold value. The functionality of this class is to + * provide a programmer-friendly and performant way to run a for loop over + * a range of indices that can potentially wrap over at the max value. + * + * This class comes in handy when looping over a range of hwtid and wrap_ids + * with subcore model in effect. Threads of a certain CTA may start mapping to + * the higher portion of the hwtid space and wrap around at the max thread id. + * E.g. 
Assuming max thread per SM is 2048, the CTA size is 128 threads, and + * the CTA's first thread maps to hwtid=2016, then the last thread shall map + * to hwtid=(2016 + 128) % 2048 - 1 = 95. Hence wrap-around. + * + * Hard-coding a for-loop that can detect wrap-arounds can make the code look + * complicated; populating an ordered-list of indices to iterate over is + * straightforward but both space- and time-inefficient. This class offers the + * benefit of functional programming by letting the programmer specify a + * lambda function to apply on each index within the specified range. + * + * The lambda function is required to take in one const unsigned argument and + * return void (i.e. std::function ). It is recommended + * the programmer use [&] to capture by-reference everything in the context, + * so as to mimic the effect of running a naked for-loop. + * + * E.g. if the original code was + * ``` + * //variables like a, b, c are in the scope + * for(unsigned int i=12; i<18; ++i){ + * //do things depending on value of i on a, b, and c + * } + * //use modified values of a, b, and c + * ``` + * + * then the code can look like this when using WrappableUnsignedRange: + * + * ``` + * //variables like a, b, c are in the scope + * WrappableUnsignedRange r(12, 18, 10000); + * r.loop( + * [&](const unsigned i){ + * //do things depending on value of i on a, b, and c + * } + * ); + * //use modified values of a, b, and c + * ``` + * + * Note: When start_index < end_index, the range of indices is [start, end) + * When start_index > end_index, the range is [start, wrapping_threshold) plus + * [0, end). + * When start_index == end_index, the range is empty. 
+ * + * It is required that 0<=start loop_body_function) { + assert(start_index >= 0); + assert(start_index < wrapping_threshold); + assert(end_index > 0); + assert(end_index <= wrapping_threshold); + + unsigned index = start_index; + while (index != end_index) { + if (index >= wrapping_threshold) { + index = 0; + } + loop_body_function(index); + ++index; + } + } +}; + #endif diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 851875e65..ee6f4bb71 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -520,22 +520,20 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, m_active_warps = 0; } - auto tids = get_index_vector_from_range_with_wrap_around - (start_thread, end_thread, m_config->n_thread_per_shader); - for (unsigned i : tids) { + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); + tid_range.loop([&](const unsigned i){ m_threadState[i].n_insn = 0; - m_threadState[i].m_cta_id = -1; - } + m_threadState[i].m_cta_id = -1; + }); + const unsigned start_warp = start_thread / m_config->warp_size; const unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 1 : 0); - - auto warp_ids = get_index_vector_from_range_with_wrap_around - (start_warp, end_warp, m_config->max_warps_per_shader); - for (unsigned i : warp_ids) { + WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader); + warp_id_range.loop([&](const unsigned i){ m_warp[i]->reset(); - m_simt_stack[i]->reset(); - } + m_simt_stack[i]->reset(); + }); } /** @@ -555,10 +553,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 
1 : 0); - auto warp_ids = get_index_vector_from_range_with_wrap_around - (start_warp, end_warp, m_config->max_warps_per_shader); - - for (unsigned i : warp_ids) { + WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader); + warp_id_range.loop([&](const unsigned i){ unsigned n_active = 0; simt_mask_t active_threads; for (unsigned t = 0; t < m_config->warp_size; t++) { @@ -592,8 +588,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id); ++m_dynamic_warp_id; m_not_completed += n_active; - ++m_active_warps; - } + ++m_active_warps; + }); } } From 797d7c8a6a68593c4568e41850d6e07b069e2826 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 23:41:21 -0400 Subject: [PATCH 13/18] include header file to support functional programming --- src/gpgpu-sim/gpu-sim.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index acb63b4b3..ecd334f5b 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -37,6 +37,7 @@ #include #include #include +#include #include "../abstract_hardware_model.h" #include "../option_parser.h" #include "../trace.h" From 20f909471bd502c3f514cb623e27b56a8e45034a Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 17:26:29 -0400 Subject: [PATCH 14/18] added more utility functions to WrappableUnsignedRange --- src/gpgpu-sim/gpu-sim.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index ecd334f5b..5b88801b9 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -829,7 +829,7 @@ const std::vector get_index_vector_from_range_with_wrap_around( * Note: When start_index < end_index, the range of indices is [start, end) * When start_index > end_index, the range is [start, wrapping_threshold) plus * [0, end). 
- * When start_index == end_index, the range is empty. + * When start_index == end_index, the range is considered empty. * * It is required that 0<=start loop_body_function) { assert(start_index >= 0); assert(start_index < wrapping_threshold); From 2db6b5a21a840ab6bad0e8c055c86bd5ee474878 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 17:28:30 -0400 Subject: [PATCH 15/18] shader_core_ctx can now handle cases when hwtid is coming from a wrapped range of thread ids, thanks to the glamorous WrappableUnsignedRange --- src/gpgpu-sim/shader.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index ee6f4bb71..6b5813ab3 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -553,13 +553,15 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 1 : 0); + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader); - warp_id_range.loop([&](const unsigned i){ + + warp_id_range.loop([&](const unsigned i){ unsigned n_active = 0; simt_mask_t active_threads; for (unsigned t = 0; t < m_config->warp_size; t++) { unsigned hwtid = i * m_config->warp_size + t; - if (hwtid < end_thread) { + if ( tid_range.contains(hwtid) ) { n_active++; assert(!m_active_threads.test(hwtid)); m_active_threads.set(hwtid); From d98b8f1dd76e4d6fa8901f14cc90a2f7690a5574 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 17:32:41 -0400 Subject: [PATCH 16/18] fixed a not-so-glamorous bug --- src/gpgpu-sim/gpu-sim.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 5b88801b9..605a7625d 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -846,7 +846,7 @@ class 
WrappableUnsignedRange { wrapping_threshold(_wrapping_threshold) {} bool isWrapped() { - return end_index < start_index + return end_index < start_index; } bool isWithinRange(const unsigned v){ From 7e9124b119c5c88e22f1487234ada089ccf5b650 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 21:20:15 -0400 Subject: [PATCH 17/18] remove code for a function that was added while discovering ways to make subcore round robin scheduling work, but is no longer relevant --- src/gpgpu-sim/gpu-sim.h | 43 ----------------------------------------- 1 file changed, 43 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 605a7625d..7b1b38c85 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -736,49 +736,6 @@ class exec_gpgpu_sim : public gpgpu_sim { virtual void createSIMTCluster(); }; -/** - * @brief Generates a constant vector of indices starting from start_index - * (inclusive), ending at end_index (exclusive), and may wrap around at the - * value specified by wrap_around_threshold. The sequence restarts at 0 - * (inclusive) after wrapping. - * - * E.g. a 3-tuple of arguments (start_index=7, end_index=3, - * wrap_around_threshold=10) will generate the following vector: {7, 8, 9, 0, 1, - * 2} - * - * @param start_index - * @param end_index - * @param wrap_around_threshold This value is non-reachable by the sequence. - * @return A const vector of the indices specified by the 3-tuple - */ -template -const std::vector get_index_vector_from_range_with_wrap_around( - T start_index, T end_index, T wrap_around_threshold) { - assert(start_index >= 0); - assert(start_index < wrap_around_threshold); - assert(end_index > 0); - assert(end_index <= wrap_around_threshold); - - // how large this vector is gonna be? - unsigned int range_size = - (end_index > start_index) - ? 
(end_index - start_index) - : (wrap_around_threshold - start_index + end_index); - - std::vector vec; - vec.reserve(range_size); - - T index = start_index; - while (index != end_index) { - if (index >= wrap_around_threshold) { - index = 0; - } - vec.push_back(index); - ++index; - } - - return vec; -} /** * @brief Represents a range of unsigned indices that can wrap around From a8a6ace77c04e0c0e4ccdc7051af2c57aaae8a84 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sun, 3 Apr 2022 13:09:21 -0400 Subject: [PATCH 18/18] disabled SUBCORE debug trace --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 24e258390..8d2b10199 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -234,8 +234,4 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - --trace_enabled 1 --trace_components SUBCORE --trace_sampling_core -1 \ No newline at end of file +#-trace_sampling_core 0 \ No newline at end of file