From 27b754cef3d6a2c12f4cefeb1c22c6067db45894 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Mon, 21 Mar 2022 16:45:03 -0400 Subject: [PATCH 01/18] end_warp's calculation was not totally accurate --- src/gpgpu-sim/shader.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 814311d1c..7b5665ba1 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -522,8 +522,10 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, m_threadState[i].n_insn = 0; m_threadState[i].m_cta_id = -1; } - for (unsigned i = start_thread / m_config->warp_size; - i < end_thread / m_config->warp_size; ++i) { + const unsigned start_warp = start_thread / m_config->warp_size; + const unsigned end_warp = end_thread / m_config->warp_size + + ((end_thread % m_config->warp_size) ? 1 : 0); + for (unsigned i = start_warp; i < end_warp; ++i) { m_warp[i]->reset(); m_simt_stack[i]->reset(); } From a8a89d107fe372759c1db42758a0c2172cd91670 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Mon, 21 Mar 2022 16:45:59 -0400 Subject: [PATCH 02/18] the way how hwtid is assigned looks subobtimal. Is this the way how NVIDIA's GPUs work? --- src/gpgpu-sim/gpu-sim.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 5af244b33..0a0150544 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1586,6 +1586,9 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { } } +//confusion: Seems like this function is seeking a contiguous range of hwtid that starts +//from an integer multiple of cta_size. This can leave holes in the range of hwtids. +//Is this overly restrictive? 
int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { From 20f252b508c8cbd27406fa86da53371eb12fb766 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 22 Mar 2022 22:14:22 -0400 Subject: [PATCH 03/18] added some documentation for core functions relavant to subcore scheduling --- src/gpgpu-sim/gpu-sim.cc | 12 ++++++++++++ src/gpgpu-sim/shader.cc | 17 ++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 0a0150544..dafebce5e 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1589,6 +1589,15 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { //confusion: Seems like this function is seeking a contiguous range of hwtid that starts //from an integer multiple of cta_size. This can leave holes in the range of hwtids. //Is this overly restrictive? +/** + * @brief Tries to find a contiguous range of available {hw_tid}s (and mark them as occupied). + * + * @param cta_size How many threads this CTA contains. Should already be + * "padded" to an integer multiple of the max warp size (m_config->warp_size) + * @param occupy Set to false for a dry run + * @return -1 if a contiguous range that can fit all threads of this cta + * cannot be found, otherwise the hw_tid to which the first thread of this cta maps + */ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { @@ -1706,6 +1715,9 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) set_max_cta(kernel); else + //shader_core_ctx::can_issue_1block should have already verified that one block + //is indeed issueable on this shader core, therefore we expect + //occupy_shader_resource_1block to return true here. 
assert(occupy_shader_resource_1block(kernel, true)); kernel.inc_running(); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 7b5665ba1..33b1def13 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -534,7 +534,8 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_thread, unsigned ctaid, int cta_size, kernel_info_t &kernel) { - // + //when concurrent_sm is enabled, + //both start_thread and end_thread are hwtid (0 <= x < n_thread_per_shader) address_type start_pc = next_pc(start_thread); unsigned kernel_id = kernel.get_uid(); if (m_config->model == POST_DOMINATOR) { @@ -3339,6 +3340,20 @@ void shader_core_ctx::display_pipeline(FILE *fout, int print_mem, } } +/** + * @brief Given the resource requirements per CTA of a kernel, calculate how + * many such CTAs can a shader core sustain when it is "empty". In other words, + * it checks if the CTA is too "fat" to fit on a core; if it can, how many. + * + * Although this function is declared to be const (promises not to modify any + * state of the shader_core_config class), it also checks if + * adaptive_cache_config is + * enabled and if yes, it might modify some states of the cache configuration. + * Read the code yourself if you are concerned! + * + * @param k + * @return unsigned int How many CTAs of the kernel can be sustained on a core. 
+ */ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { unsigned threads_per_cta = k.threads_per_cta(); const class function_info *kernel = k.entry(); From 005db44976e9abc3934f97445c0a271396afd424 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sat, 26 Mar 2022 18:00:17 -0400 Subject: [PATCH 04/18] added debugging support for subcore modelling --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 ++++- src/gpgpu-sim/gpu-sim.cc | 23 +++++++++++++------ src/gpgpu-sim/shader.cc | 1 + src/gpgpu-sim/shader.h | 5 ++-- src/trace_streams.tup | 1 + 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 8d2b10199..24e258390 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -234,4 +234,8 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 \ No newline at end of file +#-trace_sampling_core 0 + +-trace_enabled 1 +-trace_components SUBCORE +-trace_sampling_core -1 \ No newline at end of file diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index dafebce5e..609ae0225 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1594,11 +1594,12 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { * * @param cta_size How many threads this CTA contains. 
Should already be * "padded" to an integer multiple of the max warp size (m_config->warp_size) - * @param occupy Set to false for a dry run + * @param occupy Set to "false" for a dry run * @return -1 if a contiguous range that can fit all threads of this cta * cannot be found, otherwise the hw_tid to which the first thread of this cta maps */ -int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { +int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { + //TODO: use round robin based on dynamic_warp id; leave no gaps. unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { unsigned int hw_tid; @@ -1608,12 +1609,15 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { if (hw_tid == step + cta_size) // consecutive non-active break; } - if (step >= m_config->n_thread_per_shader) // didn't find + if (step >= m_config->n_thread_per_shader){ // didn't find + DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid()); return -1; + } else { if (occupy) { for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) m_occupied_hwtid.set(hw_tid); + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, step, step+cta_size-1, kernel.get_uid()); } return step; } @@ -1631,13 +1635,16 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) return false; - if (find_available_hwtid(padded_cta_size, false) == -1) return false; + if (find_available_hwtid(padded_cta_size, k, false) == -1) return false; const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size) return false; + //TODO: check if each subcore has enough regs for this block + 
//this requires tracking the amount of available regs per subcore, + //plus knowning how many warps are to be issued on each subcore. unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3); if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers) return false; @@ -1661,7 +1668,7 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, } void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, - kernel_info_t &k) { + const kernel_info_t &k) { if (m_config->gpgpu_concurrent_kernel_sm) { unsigned threads_per_cta = k.threads_per_cta(); const class function_info *kernel = k.entry(); @@ -1678,6 +1685,7 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size; hwtid++) m_occupied_hwtid.reset(hwtid); + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1714,11 +1722,12 @@ unsigned exec_shader_core_ctx::sim_init_thread( void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) set_max_cta(kernel); - else + else{ //shader_core_ctx::can_issue_1block should have already verified that one block //is indeed issueable on this shader core, therefore we expect //occupy_shader_resource_1block to return true here. 
assert(occupy_shader_resource_1block(kernel, true)); + } kernel.inc_running(); @@ -1755,7 +1764,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; } else { - start_thread = find_available_hwtid(padded_cta_size, true); + start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 33b1def13..67c09a28d 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -477,6 +477,7 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, config->max_barriers_per_cta, config->warp_size), m_active_warps(0), m_dynamic_warp_id(0) { + m_cluster = cluster; m_config = config; m_memory_config = mem_config; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index c3e6f93ed..8ef30c969 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -2514,8 +2514,8 @@ class shader_core_ctx : public core_t { public: bool can_issue_1block(kernel_info_t &kernel); bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy); - void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel); - int find_available_hwtid(unsigned int cta_size, bool occupy); + void release_shader_resource_1block(unsigned hw_ctaid, const kernel_info_t &kernel); + int find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy); private: unsigned int m_occupied_n_threads; @@ -2559,6 +2559,7 @@ class exec_shader_core_ctx : public shader_core_ctx { }; class simt_core_cluster { + friend class shader_core_ctx; public: simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, const shader_core_config *config, diff --git a/src/trace_streams.tup b/src/trace_streams.tup index 074c7c880..4457f6c25 100644 --- a/src/trace_streams.tup +++ 
b/src/trace_streams.tup @@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type ) TS_TUP( MEMORY_SUBPARTITION_UNIT ), TS_TUP( INTERCONNECT ), TS_TUP( LIVENESS ), + TS_TUP( SUBCORE ), TS_TUP( NUM_TRACE_STREAMS ) TS_TUP_END( trace_streams_type ) From 7bb066731d9fa73f07457f46f007f3e83ffca4c1 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sat, 26 Mar 2022 18:00:17 -0400 Subject: [PATCH 05/18] added debugging support for subcore modelling also added a const kernel_info_t& to the arg list of shader_core_ctx::find_available_hwtid. This allows trace calls within this function to be able to find the SM id. --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 ++++- src/gpgpu-sim/gpu-sim.cc | 23 +++++++++++++------ src/gpgpu-sim/shader.cc | 1 + src/gpgpu-sim/shader.h | 5 ++-- src/trace_streams.tup | 1 + 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 8d2b10199..24e258390 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -234,4 +234,8 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 \ No newline at end of file +#-trace_sampling_core 0 + +-trace_enabled 1 +-trace_components SUBCORE +-trace_sampling_core -1 \ No newline at end of file diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index dafebce5e..609ae0225 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1594,11 +1594,12 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { * * @param cta_size How many threads this CTA contains. 
Should already be * "padded" to an integer multiple of the max warp size (m_config->warp_size) - * @param occupy Set to false for a dry run + * @param occupy Set to "false" for a dry run * @return -1 if a contiguous range that can fit all threads of this cta * cannot be found, otherwise the hw_tid to which the first thread of this cta maps */ -int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { +int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { + //TODO: use round robin based on dynamic_warp id; leave no gaps. unsigned int step; for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { unsigned int hw_tid; @@ -1608,12 +1609,15 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) { if (hw_tid == step + cta_size) // consecutive non-active break; } - if (step >= m_config->n_thread_per_shader) // didn't find + if (step >= m_config->n_thread_per_shader){ // didn't find + DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid()); return -1; + } else { if (occupy) { for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) m_occupied_hwtid.set(hw_tid); + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, step, step+cta_size-1, kernel.get_uid()); } return step; } @@ -1631,13 +1635,16 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) return false; - if (find_available_hwtid(padded_cta_size, false) == -1) return false; + if (find_available_hwtid(padded_cta_size, k, false) == -1) return false; const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size) return false; + //TODO: check if each subcore has enough regs for this block + 
//this requires tracking the amount of available regs per subcore, + //plus knowning how many warps are to be issued on each subcore. unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3); if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers) return false; @@ -1661,7 +1668,7 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, } void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, - kernel_info_t &k) { + const kernel_info_t &k) { if (m_config->gpgpu_concurrent_kernel_sm) { unsigned threads_per_cta = k.threads_per_cta(); const class function_info *kernel = k.entry(); @@ -1678,6 +1685,7 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size; hwtid++) m_occupied_hwtid.reset(hwtid); + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1714,11 +1722,12 @@ unsigned exec_shader_core_ctx::sim_init_thread( void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) set_max_cta(kernel); - else + else{ //shader_core_ctx::can_issue_1block should have already verified that one block //is indeed issueable on this shader core, therefore we expect //occupy_shader_resource_1block to return true here. 
assert(occupy_shader_resource_1block(kernel, true)); + } kernel.inc_running(); @@ -1755,7 +1764,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; } else { - start_thread = find_available_hwtid(padded_cta_size, true); + start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 33b1def13..67c09a28d 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -477,6 +477,7 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, config->max_barriers_per_cta, config->warp_size), m_active_warps(0), m_dynamic_warp_id(0) { + m_cluster = cluster; m_config = config; m_memory_config = mem_config; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index c3e6f93ed..8ef30c969 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -2514,8 +2514,8 @@ class shader_core_ctx : public core_t { public: bool can_issue_1block(kernel_info_t &kernel); bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy); - void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel); - int find_available_hwtid(unsigned int cta_size, bool occupy); + void release_shader_resource_1block(unsigned hw_ctaid, const kernel_info_t &kernel); + int find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy); private: unsigned int m_occupied_n_threads; @@ -2559,6 +2559,7 @@ class exec_shader_core_ctx : public shader_core_ctx { }; class simt_core_cluster { + friend class shader_core_ctx; public: simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, const shader_core_config *config, diff --git a/src/trace_streams.tup b/src/trace_streams.tup index 074c7c880..4457f6c25 100644 --- a/src/trace_streams.tup +++ 
b/src/trace_streams.tup @@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type ) TS_TUP( MEMORY_SUBPARTITION_UNIT ), TS_TUP( INTERCONNECT ), TS_TUP( LIVENESS ), + TS_TUP( SUBCORE ), TS_TUP( NUM_TRACE_STREAMS ) TS_TUP_END( trace_streams_type ) From 9601dfb89b3128d6d76c6c9f5e2bdf0a8ae533ad Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 29 Mar 2022 10:56:52 -0400 Subject: [PATCH 06/18] implemented RR in find_hwtid --- src/gpgpu-sim/gpu-sim.cc | 50 +++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 609ae0225..37ba6134d 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1599,15 +1599,34 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { * cannot be found, otherwise the hw_tid to which the first thread of this cta maps */ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { - //TODO: use round robin based on dynamic_warp id; leave no gaps. - unsigned int step; - for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) { - unsigned int hw_tid; - for (hw_tid = step; hw_tid < step + cta_size; hw_tid++) { - if (m_occupied_hwtid.test(hw_tid)) break; + //TODO: use round robin based on dynamic_warp id + const unsigned int& warp_size = m_config->warp_size; + + unsigned int step=0; + while(step < m_config->n_thread_per_shader) { + //Subcore experiments on Volta V100 + //show that warps are assigned to subcores in a Round-Robin fashion, + //so we should start testing from the successor of the subcore + //to which the last warp was assigned. 
+ + //Note: Warp ids are bound to a specific scheduler - which + //is equivalent to a subcore - based on (warp_id modulo # of schedulers) + + //m_dynamic_warp_id is incremented after a warp has been initiated, + //therefore we don't need to add one to find the "next" subcore + //(ref: shader_core_ctx::init_warps) + unsigned int i; + for (i = step; i < step + cta_size; i++) { + unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; + if (m_occupied_hwtid.test(hw_tid)) break; //break from this inner for-loop + } + if (i == step + cta_size) // consecutive non-active + break; //break from the outer while-loop + else { + //start from the next warp slot + //e.g. if step was 32, i was 35, and warp_size is 32, then step will be updated to 64 + step = (i / warp_size + 1) * warp_size; } - if (hw_tid == step + cta_size) // consecutive non-active - break; } if (step >= m_config->n_thread_per_shader){ // didn't find DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid()); @@ -1615,11 +1634,13 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_in } else { if (occupy) { - for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++) + for (unsigned i = step; i < step + cta_size; i++){ + unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; m_occupied_hwtid.set(hw_tid); - DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, step, step+cta_size-1, kernel.get_uid()); + } + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid()); } - return step; + return (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; } } @@ -1635,6 
+1656,13 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader) return false; + //Even if the amount of available "thread slots" exceed our CTA size, + //if these slots are fragmented (non-continuous regions), + //we still might not be able to launch this CTA. + //Obviously fragmentation can only happen on the granularity of warp size + //since hwtids are allocated on the granularity of warp_size. + //It remains a TODO to find out if a CTA *can* launch when the warps of this CTA + //have no choice but map to non contiguous regions of hwtid. if (find_available_hwtid(padded_cta_size, k, false) == -1) return false; const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); From 7bcda79823895e2007a29f2a213ccaadf8fa1d0f Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 29 Mar 2022 21:45:40 -0400 Subject: [PATCH 07/18] added DTRACE to sucore-related code; addressed wrap-around issue --- src/gpgpu-sim/gpu-sim.cc | 100 +++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 30 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 37ba6134d..36db78d82 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1585,18 +1585,18 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) { return (get_n_active_cta() < m_config->max_cta(kernel)); } } - -//confusion: Seems like this function is seeking a contiguous range of hwtid that starts -//from an integer multiple of cta_size. This can leave holes in the range of hwtids. -//Is this overly restrictive? + /** * @brief Tries to find a contiguous range of available {hw_tid}s (and mark them as occupied). + * Wrap-arounds are allowed. * * @param cta_size How many threads this CTA contains. 
Should already be * "padded" to an integer multiple of the max warp size (m_config->warp_size) * @param occupy Set to "false" for a dry run * @return -1 if a contiguous range that can fit all threads of this cta - * cannot be found, otherwise the hw_tid to which the first thread of this cta maps + * cannot be found, otherwise the hw_tid to which the first thread of this cta maps. Note + * that since wrap-arounds can happen, naively adding cta_size to the retval - which is the + * start_thread - can result in a value exceeding the simulated hardware limits */ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) { //TODO: use round robin based on dynamic_warp id @@ -1634,11 +1634,11 @@ int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_in } else { if (occupy) { + DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid()); for (unsigned i = step; i < step + cta_size; i++){ unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; m_occupied_hwtid.set(hw_tid); } - DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid()); } return (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader; } @@ -1710,10 +1710,10 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, int start_thread = m_occupied_cta_to_hwtid[hw_ctaid]; + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); for (unsigned hwtid = 
start_thread; hwtid < start_thread + padded_cta_size; hwtid++) m_occupied_hwtid.reset(hwtid); - DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1791,18 +1791,34 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { if (!m_config->gpgpu_concurrent_kernel_sm) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; + end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1; } else { start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; + //It is necessary to perform a wrap-around. See impl. of find_available_hwtid. + end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == m_occupied_cta_to_hwtid.end()); m_occupied_cta_to_hwtid[free_cta_hw_id] = start_thread; } + // A lot of legacy function that take in a range of thread ids + // are built upon the assumption that no wrap-around happens. + // However with the subcore model this is no longer true. 
We need + // to separately process the two regions of thread id if wrap-around happens + const bool wrap_around_happens = (end_thread < start_thread); + // reset the microarchitecture state of the selected hardware thread and warp // contexts - reinit(start_thread, end_thread, false); + if(!wrap_around_happens){ + reinit(start_thread, end_thread, false); + } + else{ + reinit(start_thread, m_config->n_thread_per_shader, false); + reinit(0, end_thread, false); + } + // initalize scalar threads and determine which hardware warps they are // allocated to bind functional simulation state of threads to hardware @@ -1813,29 +1829,46 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { symbol_table *symtab = kernel_func_info->get_symtab(); unsigned ctaid = kernel.get_next_cta_id_single(); checkpoint *g_checkpoint = new checkpoint(); - for (unsigned i = start_thread; i < end_thread; i++) { - m_threadState[i].m_cta_id = free_cta_hw_id; - unsigned warp_id = i / m_config->warp_size; - nthreads_in_block += sim_init_thread( - kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread), - m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, - m_cluster->get_gpu()); - m_threadState[i].m_active = true; - // load thread local memory and register file - if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && - ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { - char fname[2048]; - snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", - i % cta_size, ctaid); - m_thread[i]->resume_reg_thread(fname, symtab); - char f1name[2048]; - snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", - i % cta_size, ctaid); - g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); + + // here is the definition of a lambda that faciliates the processing of + // disjoint thread regions in the case of wrap-around + // Everything is captured by reference so any modification within the + // lambda can affect 
the outer value being referenced + auto prepare_threads = [&](unsigned int _start_thread, unsigned int _end_thread) { + for (unsigned i = start_thread; i < end_thread; i++) { + m_threadState[i].m_cta_id = free_cta_hw_id; + unsigned warp_id = i / m_config->warp_size; + nthreads_in_block += sim_init_thread( + kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread), + m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, + m_cluster->get_gpu()); + m_threadState[i].m_active = true; + // load thread local memory and register file + if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && + ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { + char fname[2048]; + snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", + i % cta_size, ctaid); + m_thread[i]->resume_reg_thread(fname, symtab); + char f1name[2048]; + snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", + i % cta_size, ctaid); + g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); + } + // + warps.set(warp_id); } - // - warps.set(warp_id); + }; + + //the lambda is invoked here + if(!wrap_around_happens){ + prepare_threads(start_thread, end_thread); } + else{ + prepare_threads(start_thread, m_config->n_thread_per_shader); + prepare_threads(0, end_thread); + } + assert(nthreads_in_block > 0 && nthreads_in_block <= m_config->n_thread_per_shader); // should be at least one, but @@ -1854,7 +1887,14 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { m_barriers.allocate_barrier(free_cta_hw_id, warps); // initialize the SIMT stacks and fetch hardware - init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); + if(!wrap_around_happens){ + init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); + } + else{ + init_warps(free_cta_hw_id, start_thread, m_config->n_thread_per_shader, ctaid, cta_size, kernel); + init_warps(free_cta_hw_id, 0, end_thread, ctaid, cta_size, kernel); + 
} + m_n_active_cta++; shader_CTA_count_log(m_sid, 1); From caafcb3dd6d17222f4713b34343ea0990c9bbb62 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Tue, 29 Mar 2022 23:43:03 -0400 Subject: [PATCH 08/18] fixed coding mistake in previous commit --- src/gpgpu-sim/gpu-sim.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 36db78d82..bae63e121 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1830,16 +1830,20 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { unsigned ctaid = kernel.get_next_cta_id_single(); checkpoint *g_checkpoint = new checkpoint(); - // here is the definition of a lambda that faciliates the processing of - // disjoint thread regions in the case of wrap-around + //used to pass in as the "threads_left" argument passed to sim_init_thread + int threads_left = cta_size; + + // Here is the definition of a lambda that faciliates the processing of + // disjoint thread regions in the case of tid wrap-around. 
// Everything is captured by reference so any modification within the // lambda can affect the outer value being referenced + // Note we are using a lambda-local _start_thread / _end_thread value auto prepare_threads = [&](unsigned int _start_thread, unsigned int _end_thread) { - for (unsigned i = start_thread; i < end_thread; i++) { + for (unsigned i = _start_thread; i < _end_thread; i++) { m_threadState[i].m_cta_id = free_cta_hw_id; unsigned warp_id = i / m_config->warp_size; nthreads_in_block += sim_init_thread( - kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread), + kernel, &m_thread[i], m_sid, i, cta_size--, m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, m_cluster->get_gpu()); m_threadState[i].m_active = true; From cc3789ff37aad443ca7987a6d7adce834b2fceac Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 13:40:13 -0400 Subject: [PATCH 09/18] use range-based for loop to iterate over hwtids which might have wrap-arounds due to subcore scheduling Plus a utility function get_index_vector_from_range_with_wrap_around used to generate the vector of indices. --- src/gpgpu-sim/gpu-sim.cc | 96 +++++++++++++++------------------------- src/gpgpu-sim/gpu-sim.h | 37 ++++++++++++++++ src/gpgpu-sim/shader.cc | 20 +++++++-- 3 files changed, 89 insertions(+), 64 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index bae63e121..6c3cddef8 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1786,8 +1786,16 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { padded_cta_size = ((cta_size / m_config->warp_size) + 1) * (m_config->warp_size); - unsigned int start_thread, end_thread; + //find available hwtids + // Note: A lot of legacy function that take in a range of thread ids + // are built upon the assumption that no wrap-around happens. + // However with the subcore model this is no longer true. + // It is hence necessary to perform a wrap-around. + // E.g. 
to demo the effect of wrap-around, if CTA size is 10, + // n_thread_per_shader is 20 and start_thread is 18, end thread will + // not be 28 but 8. + unsigned int start_thread, end_thread; if (!m_config->gpgpu_concurrent_kernel_sm) { start_thread = free_cta_hw_id * padded_cta_size; end_thread = start_thread + cta_size; @@ -1796,30 +1804,17 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { start_thread = find_available_hwtid(padded_cta_size, kernel, true); assert((int)start_thread != -1); end_thread = start_thread + cta_size; - //It is necessary to perform a wrap-around. See impl. of find_available_hwtid. + end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1; assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) == m_occupied_cta_to_hwtid.end()); m_occupied_cta_to_hwtid[free_cta_hw_id] = start_thread; } - // A lot of legacy function that take in a range of thread ids - // are built upon the assumption that no wrap-around happens. - // However with the subcore model this is no longer true. We need - // to separately process the two regions of thread id if wrap-around happens - const bool wrap_around_happens = (end_thread < start_thread); - // reset the microarchitecture state of the selected hardware thread and warp // contexts - if(!wrap_around_happens){ - reinit(start_thread, end_thread, false); - } - else{ - reinit(start_thread, m_config->n_thread_per_shader, false); - reinit(0, end_thread, false); - } + reinit(start_thread, end_thread, false); - // initalize scalar threads and determine which hardware warps they are // allocated to bind functional simulation state of threads to hardware // resources (simulation) @@ -1833,46 +1828,31 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { //used to pass in as the "threads_left" argument passed to sim_init_thread int threads_left = cta_size; - // Here is the definition of a lambda that faciliates the processing of - // disjoint thread regions in the case of tid wrap-around.
- // Everything is captured by reference so any modification within the - // lambda can affect the outer value being referenced - // Note we are using a lambda-local _start_thread / _end_thread value - auto prepare_threads = [&](unsigned int _start_thread, unsigned int _end_thread) { - for (unsigned i = _start_thread; i < _end_thread; i++) { - m_threadState[i].m_cta_id = free_cta_hw_id; - unsigned warp_id = i / m_config->warp_size; - nthreads_in_block += sim_init_thread( - kernel, &m_thread[i], m_sid, i, cta_size--, - m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, - m_cluster->get_gpu()); - m_threadState[i].m_active = true; - // load thread local memory and register file - if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && - ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { - char fname[2048]; - snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", - i % cta_size, ctaid); - m_thread[i]->resume_reg_thread(fname, symtab); - char f1name[2048]; - snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", - i % cta_size, ctaid); - g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); - } - // - warps.set(warp_id); + auto tids = get_index_vector_from_range_with_wrap_around + (start_thread, end_thread, m_config->max_warps_per_shader); + for (unsigned i : tids) { + m_threadState[i].m_cta_id = free_cta_hw_id; + unsigned warp_id = i / m_config->warp_size; + nthreads_in_block += sim_init_thread( + kernel, &m_thread[i], m_sid, i, threads_left--, + m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id, + m_cluster->get_gpu()); + m_threadState[i].m_active = true; + // load thread local memory and register file + if (m_gpu->resume_option == 1 && kernel.get_uid() == m_gpu->resume_kernel && + ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) { + char fname[2048]; + snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt", + i % cta_size, ctaid); + 
m_thread[i]->resume_reg_thread(fname, symtab); + char f1name[2048]; + snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt", + i % cta_size, ctaid); + g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name); } - }; - - //the lambda is invoked here - if(!wrap_around_happens){ - prepare_threads(start_thread, end_thread); - } - else{ - prepare_threads(start_thread, m_config->n_thread_per_shader); - prepare_threads(0, end_thread); + // + warps.set(warp_id); } - assert(nthreads_in_block > 0 && nthreads_in_block <= m_config->n_thread_per_shader); // should be at least one, but @@ -1891,13 +1871,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { m_barriers.allocate_barrier(free_cta_hw_id, warps); // initialize the SIMT stacks and fetch hardware - if(!wrap_around_happens){ - init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); - } - else{ - init_warps(free_cta_hw_id, start_thread, m_config->n_thread_per_shader, ctaid, cta_size, kernel); - init_warps(free_cta_hw_id, 0, end_thread, ctaid, cta_size, kernel); - } + init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel); m_n_active_cta++; diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index de69ef8ce..c14c8957e 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "../abstract_hardware_model.h" #include "../option_parser.h" #include "../trace.h" @@ -735,4 +736,40 @@ class exec_gpgpu_sim : public gpgpu_sim { virtual void createSIMTCluster(); }; +/** + * @brief Generates a constant vector of indices starting from start_index (inclusive), + * ending at end_index (exclusive), and may wrap around at the value specified by + * wrap_around_threshold. The sequence restarts at 0 (inclusive) after wrapping. + * + * E.g. 
a 3-tuple of arguments (start_index=7, end_index=3, wrap_around_threshold=10) + * will generate the following vector: {7, 8, 9, 0, 1, 2} + * + * @param start_index + * @param end_index + * @param wrap_around_threshold This value is non-reachable by the sequence. + * @return A const vector of the indices specified by the 3-tuple + */ +template +const std::vector get_index_vector_from_range_with_wrap_around(T start_index, T end_index, T wrap_around_threshold){ + assert(start_index>=0); + assert(start_index0); + assert(end_index<=wrap_around_threshold); + + //how large this vector is gonna be? + unsigned int range_size = (end_index > start_index) ? (end_index - start_index) : (wrap_around_threshold - start_index + end_index); + + std::vector vec; + vec.reserve(range_size); + + T index=start_index; + while(index!=end_index){ + if(index>=wrap_around_threshold){index=0;} + vec.push_back(index); + ++index; + } + + return vec; +} + #endif diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 67c09a28d..74163a5db 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -519,19 +519,29 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, m_occupied_cta_to_hwtid.clear(); m_active_warps = 0; } - for (unsigned i = start_thread; i < end_thread; i++) { + + auto tids = get_index_vector_from_range_with_wrap_around + (stard_thread, end_thread, m_config->n_thread_per_shader); + for (unsigned i : tids) { m_threadState[i].n_insn = 0; m_threadState[i].m_cta_id = -1; } const unsigned start_warp = start_thread / m_config->warp_size; const unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 
1 : 0); - for (unsigned i = start_warp; i < end_warp; ++i) { + + auto warp_ids = get_index_vector_from_range_with_wrap_around + (stard_warp, end_warp, m_config->max_warps_per_shader); + for (unsigned i : warp_ids) { m_warp[i]->reset(); m_simt_stack[i]->reset(); } } +/** + * @brief Note: To handle the case of hwtid wrap-around (end_thread < start_thread), + * this method will generate a const vec of warp ids to iterate over in a range-based for loop. + */ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_thread, unsigned ctaid, int cta_size, kernel_info_t &kernel) { @@ -544,7 +554,11 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned warp_per_cta = cta_size / m_config->warp_size; unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 1 : 0); - for (unsigned i = start_warp; i < end_warp; ++i) { + + auto warp_ids = get_index_vector_from_range_with_wrap_around + (start_warp, end_warp, m_config->max_warps_per_shader); + + for (unsigned i : warp_ids) { unsigned n_active = 0; simt_mask_t active_threads; for (unsigned t = 0; t < m_config->warp_size; t++) { From 3eee55ad6be593b1b901302d140003cd15f6b97c Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 13:47:17 -0400 Subject: [PATCH 10/18] fixed some typos; also fixed wrong new[] call --- src/gpgpu-sim/shader.cc | 4 ++-- src/gpgpu-sim/shader.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 74163a5db..851875e65 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -521,7 +521,7 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, } auto tids = get_index_vector_from_range_with_wrap_around - (stard_thread, end_thread, m_config->n_thread_per_shader); + (start_thread, end_thread, m_config->n_thread_per_shader); for (unsigned i : tids) { m_threadState[i].n_insn = 0; m_threadState[i].m_cta_id = 
-1; @@ -531,7 +531,7 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, ((end_thread % m_config->warp_size) ? 1 : 0); auto warp_ids = get_index_vector_from_range_with_wrap_around - (stard_warp, end_warp, m_config->max_warps_per_shader); + (start_warp, end_warp, m_config->max_warps_per_shader); for (unsigned i : warp_ids) { m_warp[i]->reset(); m_simt_stack[i]->reset(); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 8ef30c969..155f479c3 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -957,7 +957,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_sub_core_model = sub_core_model; m_num_warp_scheds = num_warp_scheds; if (m_sub_core_model) { - m_last_cu_set = new unsigned(m_num_warp_scheds); + m_last_cu_set = new unsigned[m_num_warp_scheds]; for (unsigned i = 0; i < m_num_warp_scheds; i++) { m_last_cu_set[i] = i * m_num_collectors / m_num_warp_scheds; From dc18c0be4bf8749c27e981461a8f8279ad451062 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 13:57:36 -0400 Subject: [PATCH 11/18] fixed some typo --- src/gpgpu-sim/gpu-sim.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 6c3cddef8..9df76e5ec 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1829,7 +1829,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { int threads_left = cta_size; auto tids = get_index_vector_from_range_with_wrap_around - (start_thread, end_thread, m_config->max_warps_per_shader); + (start_thread, end_thread, m_config->n_thread_per_shader); for (unsigned i : tids) { m_threadState[i].m_cta_id = free_cta_hw_id; unsigned warp_id = i / m_config->warp_size; From c1750eafdb21ed1baa04ba5d22502cb0499867f6 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 23:25:16 -0400 Subject: [PATCH 12/18] use functional programming to address the id wrap-around issue --- 
src/gpgpu-sim/gpu-sim.cc | 19 +++-- src/gpgpu-sim/gpu-sim.h | 148 +++++++++++++++++++++++++++++++-------- src/gpgpu-sim/shader.cc | 30 ++++---- 3 files changed, 144 insertions(+), 53 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 9df76e5ec..136ed0261 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1708,12 +1708,17 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid, assert(m_occupied_n_threads >= padded_cta_size); m_occupied_n_threads -= padded_cta_size; - int start_thread = m_occupied_cta_to_hwtid[hw_ctaid]; + unsigned start_thread = m_occupied_cta_to_hwtid[hw_ctaid]; + unsigned end_thread = ((start_thread + padded_cta_size) - 1) % m_config->n_thread_per_shader + 1; + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); + DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid()); - for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size; - hwtid++) + + tid_range.loop([&](const unsigned hwtid){ m_occupied_hwtid.reset(hwtid); + }); + m_occupied_cta_to_hwtid.erase(hw_ctaid); const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel); @@ -1828,9 +1833,8 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { //used to pass in as the "threads_left" argument passed to sim_init_thread int threads_left = cta_size; - auto tids = get_index_vector_from_range_with_wrap_around - (start_thread, end_thread, m_config->n_thread_per_shader); - for (unsigned i : tids) { + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); + tid_range.loop([&](const unsigned i){ m_threadState[i].m_cta_id = free_cta_hw_id; unsigned warp_id = i / m_config->warp_size; nthreads_in_block += sim_init_thread( @@ -1852,7 +1856,8 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { } 
// warps.set(warp_id); - } + }); + assert(nthreads_in_block > 0 && nthreads_in_block <= m_config->n_thread_per_shader); // should be at least one, but diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index c14c8957e..acb63b4b3 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, +// Nikos Hardavellas Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -28,7 +29,6 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
- #ifndef GPU_SIM_H #define GPU_SIM_H @@ -73,7 +73,7 @@ extern tr1_hash_map address_random_interleaving; enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; enum hw_perf_t { - HW_BENCH_NAME=0, + HW_BENCH_NAME = 0, HW_KERNEL_NAME, HW_L1_RH, HW_L1_RM, @@ -109,7 +109,7 @@ struct power_config { s++; } char buf1[1024]; - //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); + // snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); snprintf(buf1, 1024, "accelwattch_power_report.log"); g_power_filename = strdup(buf1); char buf2[1024]; @@ -156,7 +156,6 @@ struct power_config { double gpu_steady_power_deviation; double gpu_steady_min_period; - char *g_hw_perf_file_name; char *g_hw_perf_bench_name; int g_power_simulation_mode; @@ -737,34 +736,42 @@ class exec_gpgpu_sim : public gpgpu_sim { }; /** - * @brief Generates a constant vector of indices starting from start_index (inclusive), - * ending at end_index (exclusive), and may wrap around at the value specified by - * wrap_around_threshold. The sequence restarts at 0 (inclusive) after wrapping. - * - * E.g. a 3-tuple of arguments (start_index=7, end_index=3, wrap_around_threshold=10) - * will generate the following vector: {7, 8, 9, 0, 1, 2} - * - * @param start_index + * @brief Generates a constant vector of indices starting from start_index + * (inclusive), ending at end_index (exclusive), and may wrap around at the + * value specified by wrap_around_threshold. The sequence restarts at 0 + * (inclusive) after wrapping. + * + * E.g. a 3-tuple of arguments (start_index=7, end_index=3, + * wrap_around_threshold=10) will generate the following vector: {7, 8, 9, 0, 1, + * 2} + * + * @param start_index * @param end_index * @param wrap_around_threshold This value is non-reachable by the sequence. 
* @return A const vector of the indices specified by the 3-tuple */ template -const std::vector get_index_vector_from_range_with_wrap_around(T start_index, T end_index, T wrap_around_threshold){ - assert(start_index>=0); - assert(start_index0); - assert(end_index<=wrap_around_threshold); - - //how large this vector is gonna be? - unsigned int range_size = (end_index > start_index) ? (end_index - start_index) : (wrap_around_threshold - start_index + end_index); - +const std::vector get_index_vector_from_range_with_wrap_around( + T start_index, T end_index, T wrap_around_threshold) { + assert(start_index >= 0); + assert(start_index < wrap_around_threshold); + assert(end_index > 0); + assert(end_index <= wrap_around_threshold); + + // how large this vector is gonna be? + unsigned int range_size = + (end_index > start_index) + ? (end_index - start_index) + : (wrap_around_threshold - start_index + end_index); + std::vector vec; vec.reserve(range_size); - T index=start_index; - while(index!=end_index){ - if(index>=wrap_around_threshold){index=0;} + T index = start_index; + while (index != end_index) { + if (index >= wrap_around_threshold) { + index = 0; + } vec.push_back(index); ++index; } @@ -772,4 +779,87 @@ const std::vector get_index_vector_from_range_with_wrap_around(T start_index, return vec; } +/** + * @brief Represents a range of unsigned indices that can wrap around + * at a certain threshold value. The functionality of this class is to + * provide a programmer-friendly and performant way to run a for loop over + * a range of indices that can potentially wrap over at the max value. + * + * This class comes in handy when looping over a range of hwtid and wrap_ids + * with subcore model in effect. Threads of a certain CTA may start mapping to + * the higher portion of the hwtid space and wrap around at the max thread id. + * E.g. 
Assuming max thread per SM is 2048, the CTA size is 128 threads, and + * the CTA's first thread maps to hwtid=2016, then the last thread shall map + * to hwtid=(2016 + 128) % 2048 - 1 = 95. Hence wrap-around. + * + * Hard-coding a for-loop that can detect wrap-arounds can make the code look + * complicated; populating an ordered-list of indices to iterate over is + * straightforward but both space- and time-inefficient. This class offers the + * benefit of functional programming by letting the programmer specify a + * lambda function to apply on each index within the specified range. + * + * The lambda function is required to take in one const unsigned argument and + * return void (i.e. std::function ). It is recommended + * the programmer use [&] to capture by-reference everything in the context, + * so as to mimic the effect of running a naked for-loop. + * + * E.g. if the original code was + * ``` + * //variables like a, b, c are in the scope + * for(unsigned int i=12; i<18; ++i){ + * //do things depending on value of i on a, b, and c + * } + * //use modified values of a, b, and c + * ``` + * + * then the code can look like this when using WrappableUnsignedRange: + * + * ``` + * //variables like a, b, c are in the scope + * WrappableUnsignedRange r(12, 18, 10000); + * r.loop( + * [&](const unsigned i){ + * //do things depending on value of i on a, b, and c + * } + * ); + * //use modified values of a, b, and c + * ``` + * + * Note: When start_index < end_index, the range of indices is [start, end) + * When start_index > end_index, the range is [start, wrapping_threshold) plus + * [0, end). + * When start_index == end_index, the range is empty. 
+ * + * It is required that 0<=start loop_body_function) { + assert(start_index >= 0); + assert(start_index < wrapping_threshold); + assert(end_index > 0); + assert(end_index <= wrapping_threshold); + + unsigned index = start_index; + while (index != end_index) { + if (index >= wrapping_threshold) { + index = 0; + } + loop_body_function(index); + ++index; + } + } +}; + #endif diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 851875e65..ee6f4bb71 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -520,22 +520,20 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, m_active_warps = 0; } - auto tids = get_index_vector_from_range_with_wrap_around - (start_thread, end_thread, m_config->n_thread_per_shader); - for (unsigned i : tids) { + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); + tid_range.loop([&](const unsigned i){ m_threadState[i].n_insn = 0; - m_threadState[i].m_cta_id = -1; - } + m_threadState[i].m_cta_id = -1; + }); + const unsigned start_warp = start_thread / m_config->warp_size; const unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 1 : 0); - - auto warp_ids = get_index_vector_from_range_with_wrap_around - (start_warp, end_warp, m_config->max_warps_per_shader); - for (unsigned i : warp_ids) { + WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader); + warp_id_range.loop([&](const unsigned i){ m_warp[i]->reset(); - m_simt_stack[i]->reset(); - } + m_simt_stack[i]->reset(); + }); } /** @@ -555,10 +553,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 
1 : 0); - auto warp_ids = get_index_vector_from_range_with_wrap_around - (start_warp, end_warp, m_config->max_warps_per_shader); - - for (unsigned i : warp_ids) { + WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader); + warp_id_range.loop([&](const unsigned i){ unsigned n_active = 0; simt_mask_t active_threads; for (unsigned t = 0; t < m_config->warp_size; t++) { @@ -592,8 +588,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id); ++m_dynamic_warp_id; m_not_completed += n_active; - ++m_active_warps; - } + ++m_active_warps; + }); } } From 797d7c8a6a68593c4568e41850d6e07b069e2826 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 23:41:21 -0400 Subject: [PATCH 13/18] include header file to support functional programming --- src/gpgpu-sim/gpu-sim.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index acb63b4b3..ecd334f5b 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -37,6 +37,7 @@ #include #include #include +#include #include "../abstract_hardware_model.h" #include "../option_parser.h" #include "../trace.h" From 20f909471bd502c3f514cb623e27b56a8e45034a Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 17:26:29 -0400 Subject: [PATCH 14/18] added more utility functions to WrappableUnsignedRange --- src/gpgpu-sim/gpu-sim.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index ecd334f5b..5b88801b9 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -829,7 +829,7 @@ const std::vector get_index_vector_from_range_with_wrap_around( * Note: When start_index < end_index, the range of indices is [start, end) * When start_index > end_index, the range is [start, wrapping_threshold) plus * [0, end). 
- * When start_index == end_index, the range is empty. + * When start_index == end_index, the range is considered empty. * * It is required that 0<=start loop_body_function) { assert(start_index >= 0); assert(start_index < wrapping_threshold); From 2db6b5a21a840ab6bad0e8c055c86bd5ee474878 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 17:28:30 -0400 Subject: [PATCH 15/18] shader_core_ctx can now handle cases when hwtid is coming from a wrapped range of thread ids, thanks to the glamorous WrappableUnsignedRange --- src/gpgpu-sim/shader.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index ee6f4bb71..6b5813ab3 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -553,13 +553,15 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_warp = end_thread / m_config->warp_size + ((end_thread % m_config->warp_size) ? 1 : 0); + WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader); WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader); - warp_id_range.loop([&](const unsigned i){ + + warp_id_range.loop([&](const unsigned i){ unsigned n_active = 0; simt_mask_t active_threads; for (unsigned t = 0; t < m_config->warp_size; t++) { unsigned hwtid = i * m_config->warp_size + t; - if (hwtid < end_thread) { + if ( tid_range.contains(hwtid) ) { n_active++; assert(!m_active_threads.test(hwtid)); m_active_threads.set(hwtid); From d98b8f1dd76e4d6fa8901f14cc90a2f7690a5574 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 17:32:41 -0400 Subject: [PATCH 16/18] fixed a not-so-glamorous bug --- src/gpgpu-sim/gpu-sim.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 5b88801b9..605a7625d 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -846,7 +846,7 @@ class 
WrappableUnsignedRange { wrapping_threshold(_wrapping_threshold) {} bool isWrapped() { - return end_index < start_index + return end_index < start_index; } bool isWithinRange(const unsigned v){ From 7e9124b119c5c88e22f1487234ada089ccf5b650 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Fri, 1 Apr 2022 21:20:15 -0400 Subject: [PATCH 17/18] remove code for a function that was added while discovering ways to make subcore round robin scheduling work, but is no longer relevant --- src/gpgpu-sim/gpu-sim.h | 43 ----------------------------------------- 1 file changed, 43 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 605a7625d..7b1b38c85 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -736,49 +736,6 @@ class exec_gpgpu_sim : public gpgpu_sim { virtual void createSIMTCluster(); }; -/** - * @brief Generates a constant vector of indices starting from start_index - * (inclusive), ending at end_index (exclusive), and may wrap around at the - * value specified by wrap_around_threshold. The sequence restarts at 0 - * (inclusive) after wrapping. - * - * E.g. a 3-tuple of arguments (start_index=7, end_index=3, - * wrap_around_threshold=10) will generate the following vector: {7, 8, 9, 0, 1, - * 2} - * - * @param start_index - * @param end_index - * @param wrap_around_threshold This value is non-reachable by the sequence. - * @return A const vector of the indices specified by the 3-tuple - */ -template -const std::vector get_index_vector_from_range_with_wrap_around( - T start_index, T end_index, T wrap_around_threshold) { - assert(start_index >= 0); - assert(start_index < wrap_around_threshold); - assert(end_index > 0); - assert(end_index <= wrap_around_threshold); - - // how large this vector is gonna be? - unsigned int range_size = - (end_index > start_index) - ? 
(end_index - start_index) - : (wrap_around_threshold - start_index + end_index); - - std::vector vec; - vec.reserve(range_size); - - T index = start_index; - while (index != end_index) { - if (index >= wrap_around_threshold) { - index = 0; - } - vec.push_back(index); - ++index; - } - - return vec; -} /** * @brief Represents a range of unsigned indices that can wrap around From a8a6ace77c04e0c0e4ccdc7051af2c57aaae8a84 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sun, 3 Apr 2022 13:09:21 -0400 Subject: [PATCH 18/18] disabled SUBCORE debug trace --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 24e258390..8d2b10199 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -234,8 +234,4 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - --trace_enabled 1 --trace_components SUBCORE --trace_sampling_core -1 \ No newline at end of file +#-trace_sampling_core 0 \ No newline at end of file